### Title: 00_build_wweia_dataset_day_1_2
### Purpose: Build out the WWEIA dataset by intergrating the covariates / inclusion criteria / CRP data
### Date: March 14, 2024
### Author: Jules Larke

In [1]:
# Education, Age, and Sex are already in wweia_ingredients

In [2]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
# Load data for WWEIA_ingredients
# Dataset (wweia_all_recalls.txt) can be generated from https://github.com/JulesLarke-USDA/wweia_ingredients
# and placed in the proper directory for loading

wweia = pd.read_csv('/Users/jules.larke/work/github/wweia_ingredients/data/04/wweia_all_recalls.txt', sep='\t')

In [4]:
wweia.shape

(4657896, 81)

In [131]:
wweia

Unnamed: 0,SEQN,ingred_code,ingred_desc,Ingred_consumed_g,Capric acid,Lauric acid,Myristic acid,Palmitic acid,Palmitoleic acid,Stearic acid,...,RIDRETH1,INDFMPIR,DMDEDUC3,DMDEDUC2,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,CYCLE,diet_wts
0,9966.0,1048.0,"Cheese spread, pasteurized process, American",21.265000,0.109089,0.134395,4.637896e-01,1.256336,1.190840e-01,5.175901e-01,...,Non-Hispanic_White,2.93,,4.0,85045.160060,91352.991726,2.0,22.0,01_02,52579.786405
1,9966.0,1049.0,"Cream, fluid, half and half",22.690000,0.068751,0.079642,2.604812e-01,0.727668,5.581740e-02,2.770449e-01,...,Non-Hispanic_White,2.93,,4.0,85045.160060,91352.991726,2.0,22.0,01_02,52579.786405
2,9966.0,1079.0,"Milk, reduced fat, fluid, 2% milkfat, with add...",122.000000,0.059780,0.067100,2.135000e-01,0.680760,3.294000e-02,2.964600e-01,...,Non-Hispanic_White,2.93,,4.0,85045.160060,91352.991726,2.0,22.0,01_02,52579.786405
3,9966.0,1111.0,"Milk, averaged fat, with added vitamin A and D",3.193870,0.001222,0.001294,4.559249e-03,0.013566,3.752797e-04,5.932614e-03,...,Non-Hispanic_White,2.93,,4.0,85045.160060,91352.991726,2.0,22.0,01_02,52579.786405
4,9966.0,1124.0,"Egg, white, raw, fresh",0.295550,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,...,Non-Hispanic_White,2.93,,4.0,85045.160060,91352.991726,2.0,22.0,01_02,52579.786405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657891,102956.0,19335.0,"Sugars, granulated",58.335000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,...,Non-Hispanic_White,1.56,,4.0,38645.740291,39426.299948,1.0,142.0,17_18,92756.884416
4657892,102956.0,19350.0,"Syrups, corn, light",1.355040,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,...,Non-Hispanic_White,1.56,,4.0,38645.740291,39426.299948,1.0,142.0,17_18,92756.884416
4657893,102956.0,20027.0,Cornstarch,0.049210,0.000000,0.000000,0.000000e+00,0.000004,0.000000e+00,4.921026e-07,...,Non-Hispanic_White,1.56,,4.0,38645.740291,39426.299948,1.0,142.0,17_18,92756.884416
4657894,102956.0,20061.0,"Rice flour, white, unenriched",0.000879,0.000000,0.000000,7.030037e-08,0.000003,4.393773e-08,2.284762e-07,...,Non-Hispanic_White,1.56,,4.0,38645.740291,39426.299948,1.0,142.0,17_18,92756.884416


In [5]:
wweia.SEQN.nunique()

79298

In [6]:
# subset to include 18+ y/o
adults = wweia[wweia['RIDAGEYR'] > 17]

In [7]:
adults.rename(columns={'RIDAGEYR': 'Age'},inplace=True)
adults.rename(columns={'RIAGENDR': 'Sex'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adults.rename(columns={'RIDAGEYR': 'Age'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adults.rename(columns={'RIAGENDR': 'Sex'},inplace=True)


### Education

In [8]:
adults.loc[adults.SEQN==16247, 'DMDEDUC2'] = 9

In [9]:
# recode edu levels for age 19 and under
adults['DMDEDUC3'] = adults['DMDEDUC3'].replace([13, 14, 15], 'high school graduate or equivalent')
adults['DMDEDUC3'] = adults['DMDEDUC3'].replace([9, 10, 11, 12, 66, 99], 'less than high school graduate')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adults['DMDEDUC3'] = adults['DMDEDUC3'].replace([13, 14, 15], 'high school graduate or equivalent')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adults['DMDEDUC3'] = adults['DMDEDUC3'].replace([9, 10, 11, 12, 66, 99], 'less than high school graduate')


In [10]:
# recode edu levels for age 20 and older
adults['DMDEDUC2'] = adults['DMDEDUC2'].replace([1, 2], 'less than high school graduate')
adults['DMDEDUC2'] = adults['DMDEDUC2'].replace(3, 'high school graduate or equivalent')
adults['DMDEDUC2'] = adults['DMDEDUC2'].replace(4, 'some college')
adults['DMDEDUC2'] = adults['DMDEDUC2'].replace(5, 'college graduate')
adults['DMDEDUC2'] = adults['DMDEDUC2'].replace([7, 9], 'unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adults['DMDEDUC2'] = adults['DMDEDUC2'].replace([1, 2], 'less than high school graduate')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adults['DMDEDUC2'] = adults['DMDEDUC2'].replace(3, 'high school graduate or equivalent')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adults['DMDEDUC2'] = adult

In [11]:
# create single feature for education including all participants
adults['education'] = adults.filter(like='DMDEDUC').ffill(axis=1).iloc[:,-1].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adults['education'] = adults.filter(like='DMDEDUC').ffill(axis=1).iloc[:,-1].copy()


In [12]:
adults = adults.drop(columns=['DMDEDUC2', 'DMDEDUC3'])

### BMI

In [13]:
# Exam data - Body Measures (BMI: BMXBMI)
bmx_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/BMX_B.XPT', format='xport', encoding='utf-8')
bmx_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/BMX_C.XPT', format='xport', encoding='utf-8')
bmx_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/BMX_D.XPT', format='xport', encoding='utf-8')
bmx_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/BMX_E.XPT', format='xport', encoding='utf-8')
bmx_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/BMX_F.XPT', format='xport', encoding='utf-8')
bmx_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BMX_G.XPT', format='xport', encoding='utf-8')
bmx_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/BMX_H.XPT', format='xport', encoding='utf-8')
bmx_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/BMX_I.XPT', format='xport', encoding='utf-8')
bmx_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BMX_J.XPT', format='xport', encoding='utf-8')

In [14]:
bmx_B = bmx_B[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]
bmx_C = bmx_C[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]
bmx_D = bmx_D[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]
bmx_E = bmx_E[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]
bmx_F = bmx_F[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]
bmx_G = bmx_G[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]
bmx_H = bmx_H[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]
bmx_I = bmx_I[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]
bmx_J = bmx_J[['SEQN', 'BMXBMI', 'BMXWT', 'BMXWAIST']]

bmi = pd.concat([bmx_B, bmx_C, bmx_D, bmx_E, bmx_F, bmx_G, bmx_H, bmx_I, bmx_J])
bmi.rename(columns={'BMXBMI': 'BMI', 'BMXWT': 'body_wt', 'BMXWAIST': 'WC'}, inplace=True)

In [15]:
adults = adults.merge(bmi, on='SEQN', how='left')

### Impute BMI from WC for those who have WC measured (197)

In [16]:
no_bmi = adults[adults['BMI'].isnull()]

In [17]:
no_bmi = no_bmi[['SEQN', 'Sex', 'BMI', 'WC']]

In [18]:
no_bmi = no_bmi.drop_duplicates(subset=['SEQN'])

In [19]:
no_bmi = no_bmi[~no_bmi['WC'].isnull()]

In [20]:
no_bmi_m = no_bmi[no_bmi['Sex']=='Male']
no_bmi_f = no_bmi[no_bmi['Sex']=='Female']

In [21]:
bmi_wc = adults[['SEQN', 'Sex', 'BMI', 'WC']]

In [22]:
bmi_wc = bmi_wc.drop_duplicates(subset=['SEQN'])

In [23]:
bmi_wc = bmi_wc.dropna(subset=['BMI', 'WC'])

In [24]:
bmi_wc_m = bmi_wc[bmi_wc['Sex']=='Male']

In [25]:
bmi_wc_f = bmi_wc[bmi_wc['Sex']=='Female']

In [26]:
x_m = np.array(bmi_wc_m['WC']).reshape((-1, 1))
y_m = np.array(bmi_wc_m['BMI']).reshape((-1, 1))
z_m = np.array(no_bmi_m['WC']).reshape((-1, 1))

x_f = np.array(bmi_wc_f['WC']).reshape((-1, 1))
y_f = np.array(bmi_wc_f['BMI']).reshape((-1, 1))
z_f = np.array(no_bmi_f['WC']).reshape((-1, 1))

In [27]:
model_m = LinearRegression().fit(x_m, y_m)

In [28]:
r_sq = model_m.score(x_m, y_m)
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {model_m.intercept_}")
print(f"slope: {model_m.coef_}")

coefficient of determination: 0.8461023921101636
intercept: [-5.87308117]
slope: [[0.34191912]]


In [29]:
y_pred_m = model_m.predict(z_m)
print(f"predicted response:\n{y_pred_m}")

predicted response:
[[28.01110387]
 [26.81438694]
 [27.15630606]
 [24.11322587]
 [29.82327521]
 [35.6359003 ]
 [25.61767001]
 [27.73756857]
 [30.47292155]
 [31.80640612]
 [28.01110387]
 [29.54973992]
 [26.09635678]
 [18.09544931]
 [30.47292155]
 [24.04484205]
 [29.58393183]
 [22.19847878]
 [22.91650894]
 [24.7628722 ]
 [27.01953841]
 [24.86544794]
 [27.73756857]
 [21.00176186]
 [38.7131724 ]
 [32.72958775]
 [38.09771798]
 [25.92539722]
 [31.12256788]
 [21.00176186]
 [22.67716556]
 [36.93519296]
 [33.55019365]
 [30.71226493]
 [28.25044725]
 [26.60923546]
 [34.50756719]
 [26.81438694]
 [25.51509427]
 [24.4893369 ]
 [26.57504355]
 [32.69539584]
 [28.45559872]
 [31.66963848]
 [30.40453772]
 [22.67716556]
 [33.78953703]
 [28.59236637]
 [25.5834781 ]
 [32.0115576 ]
 [21.1385295 ]
 [30.3361539 ]
 [31.53287083]
 [28.69494211]
 [26.60923546]
 [31.7380223 ]
 [34.98625396]
 [30.81484067]
 [29.17362888]
 [33.61857747]
 [29.61812374]
 [23.6345391 ]
 [48.04756444]
 [27.36145753]
 [28.07948769]
 [32.

In [30]:
model_f = LinearRegression().fit(x_f, y_f)

In [31]:
r_sq = model_f.score(x_f, y_f)
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {model_f.intercept_}")
print(f"slope: {model_f.coef_}")

coefficient of determination: 0.8384576719614756
intercept: [-9.98255724]
slope: [[0.40583216]]


In [32]:
y_pred_f = model_m.predict(z_f)
print(f"predicted response:\n{y_pred_f}")

predicted response:
[[21.99332731]
 [30.02842669]
 [21.61721628]
 [35.32817309]
 [30.71226493]
 [20.72822656]
 [22.36943835]
 [20.557267  ]
 [28.18206343]
 [43.53423202]
 [32.52443628]
 [21.00176186]
 [28.14787151]
 [28.79751785]
 [35.73847603]
 [20.86499421]
 [25.72024575]
 [41.38014155]
 [28.90009358]
 [31.05418405]
 [31.7380223 ]
 [27.29307371]
 [32.07994142]
 [21.48044863]
 [34.98625396]
 [41.58529303]
 [27.49822518]
 [35.22559735]
 [37.65322312]
 [33.9604966 ]
 [22.33524643]
 [30.19938625]
 [27.97691195]
 [26.30150825]
 [27.08792223]
 [31.08837597]
 [29.41297227]
 [18.91605521]
 [17.10388386]
 [41.03822243]
 [31.60125465]
 [36.62746575]
 [28.48979064]
 [29.515548  ]
 [29.58393183]
 [22.19847878]
 [30.13100242]
 [32.04574951]
 [22.43782217]
 [21.07014568]
 [27.53241709]
 [29.68650757]
 [28.79751785]
 [26.13054869]
 [29.54973992]
 [32.04574951]
 [28.96847741]
 [29.03686123]
 [25.0364075 ]
 [19.77085301]
 [32.31928481]
 [23.22423615]
 [31.05418405]
 [45.4147872 ]
 [35.0888297 ]
 [21.

In [33]:
no_bmi_m['BMI'] = y_pred_m.flatten()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_bmi_m['BMI'] = y_pred_m.flatten()


In [34]:
no_bmi_f['BMI'] = y_pred_f.flatten()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_bmi_f['BMI'] = y_pred_f.flatten()


In [35]:
no_bmi = pd.concat([no_bmi_m, no_bmi_f])

In [36]:
adults.set_index('SEQN', inplace=True)

In [37]:
no_bmi.set_index('SEQN', inplace=True)

In [38]:
adults['BMI'].update(no_bmi['BMI'])

In [39]:
adults.dropna(subset='BMI', inplace=True)

In [40]:
adults.reset_index(inplace=True)

### Smoking status

In [41]:
# Questionnaire data - Current or ever smoker
smq_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/SMQ_B.XPT', format='xport', encoding='utf-8')
smq_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/SMQ_C.XPT', format='xport', encoding='utf-8')
smq_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/SMQ_D.XPT', format='xport', encoding='utf-8')
smq_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/SMQ_E.XPT', format='xport', encoding='utf-8')
smq_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/SMQ_F.XPT', format='xport', encoding='utf-8')
smq_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/SMQ_G.XPT', format='xport', encoding='utf-8')
smq_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/SMQ_H.XPT', format='xport', encoding='utf-8')
smq_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/SMQ_I.XPT', format='xport', encoding='utf-8')
smq_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/SMQ_J.XPT', format='xport', encoding='utf-8')

In [42]:
smq_B = smq_B[['SEQN', 'SMQ020', 'SMQ040']]
smq_C = smq_C[['SEQN', 'SMQ020', 'SMQ040']]
smq_D = smq_D[['SEQN', 'SMQ020', 'SMQ040']]
smq_E = smq_E[['SEQN', 'SMQ020', 'SMQ040']]
smq_F = smq_F[['SEQN', 'SMQ020', 'SMQ040']]
smq_G = smq_G[['SEQN', 'SMQ020', 'SMQ040']]
smq_H = smq_H[['SEQN', 'SMQ020', 'SMQ040']]
smq_I = smq_I[['SEQN', 'SMQ020', 'SMQ040']]
smq_J = smq_J[['SEQN', 'SMQ020', 'SMQ040']]

smq = pd.concat([smq_B, smq_C, smq_D, smq_E, smq_F, smq_G, smq_H, smq_I, smq_J])
smq.rename(columns={'SMQ020':'ever_smoker', 'SMQ040': 'current_smoker'}, inplace=True)

In [43]:
adults = adults.merge(smq, on='SEQN', how='left')

In [44]:
# recode levels for smoking status
adults['ever_smoker'] = adults['ever_smoker'].replace(1, 'yes')
adults['ever_smoker'] = adults['ever_smoker'].replace(2, 'no')
adults['ever_smoker'] = adults['ever_smoker'].replace([7, 9], 'unknown')

### Diabetes

In [45]:
#Lab data - fasting glucose

fg_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L10AM_B.XPT', format='xport', encoding='utf-8')
fg_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/L10AM_C.XPT', format='xport', encoding='utf-8')
fg_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/GLU_D.XPT', format='xport', encoding='utf-8')
fg_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/GLU_E.XPT', format='xport', encoding='utf-8')
fg_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/GLU_F.XPT', format='xport', encoding='utf-8')
fg_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/GLU_G.XPT', format='xport', encoding='utf-8')
fg_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/GLU_H.XPT', format='xport', encoding='utf-8')
fg_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/GLU_I.XPT', format='xport', encoding='utf-8')
fg_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/GLU_J.XPT', format='xport', encoding='utf-8')

In [46]:
fg_B = fg_B[['SEQN', 'LBXGLU']]
fg_C = fg_C[['SEQN', 'LBXGLU']]
fg_D = fg_D[['SEQN', 'LBXGLU']]
fg_E = fg_E[['SEQN', 'LBXGLU']]
fg_F = fg_F[['SEQN', 'LBXGLU']]
fg_G = fg_G[['SEQN', 'LBXGLU']]
fg_H = fg_H[['SEQN', 'LBXGLU']]
fg_I = fg_I[['SEQN', 'LBXGLU']]
fg_J = fg_J[['SEQN', 'LBXGLU']]

fg = pd.concat([fg_B, fg_C, fg_D, fg_E, fg_F, fg_G, fg_H, fg_I, fg_J])
fg.rename(columns={'LBXGLU':'fasting_glc_mg_dL'}, inplace=True)

In [47]:
adults = adults.merge(fg, on='SEQN', how='left')

In [48]:
adults['diabetes_fasting_glc'] = np.where(adults['fasting_glc_mg_dL'] >= 126, 'yes', 'no')

In [49]:
#Lab data - glycohemoglobic (hba1c)

gh_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L10_B.XPT', format='xport', encoding='utf-8')
gh_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/L10_C.XPT', format='xport', encoding='utf-8')
gh_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/GHB_D.XPT', format='xport', encoding='utf-8')
gh_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/GHB_E.XPT', format='xport', encoding='utf-8')
gh_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/GHB_F.XPT', format='xport', encoding='utf-8')
gh_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/GHB_G.XPT', format='xport', encoding='utf-8')
gh_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/GHB_H.XPT', format='xport', encoding='utf-8')
gh_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/GHB_I.XPT', format='xport', encoding='utf-8')
gh_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/GHB_J.XPT', format='xport', encoding='utf-8')

In [50]:
gh_B = gh_B[['SEQN', 'LBXGH']]
gh_C = gh_C[['SEQN', 'LBXGH']]
gh_D = gh_D[['SEQN', 'LBXGH']]
gh_E = gh_E[['SEQN', 'LBXGH']]
gh_F = gh_F[['SEQN', 'LBXGH']]
gh_G = gh_G[['SEQN', 'LBXGH']]
gh_H = gh_H[['SEQN', 'LBXGH']]
gh_I = gh_I[['SEQN', 'LBXGH']]
gh_J = gh_J[['SEQN', 'LBXGH']]

gh = pd.concat([gh_B, gh_C, gh_D, gh_E, gh_F, gh_G, gh_H, gh_I, gh_J])
gh.rename(columns={'LBXGH':'hba1c_percent'}, inplace=True)

In [51]:
adults = adults.merge(gh, on='SEQN', how='left')

In [52]:
adults['diabetes_hba1c'] = np.where(adults['hba1c_percent'] >= 6.5, 'yes', 'no')

In [53]:
#Questionnaire data - taking insulin or glucose lowering meds

diq_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/DIQ_B.XPT', format='xport', encoding='utf-8')
diq_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/DIQ_C.XPT', format='xport', encoding='utf-8')
diq_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/DIQ_D.XPT', format='xport', encoding='utf-8')
diq_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/DIQ_E.XPT', format='xport', encoding='utf-8')
diq_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/DIQ_F.XPT', format='xport', encoding='utf-8')
diq_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DIQ_G.XPT', format='xport', encoding='utf-8')
diq_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DIQ_H.XPT', format='xport', encoding='utf-8')
diq_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DIQ_I.XPT', format='xport', encoding='utf-8')
diq_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DIQ_J.XPT', format='xport', encoding='utf-8')

In [54]:
diq_B = diq_B[['SEQN', 'DIQ050', 'DIQ070']]
diq_C = diq_C[['SEQN', 'DIQ050', 'DIQ070']]
diq_D = diq_D[['SEQN', 'DIQ050', 'DID070']]
diq_E = diq_E[['SEQN', 'DIQ050', 'DID070']]
diq_F = diq_F[['SEQN', 'DIQ050', 'DIQ070']]
diq_G = diq_G[['SEQN', 'DIQ050', 'DIQ070']]
diq_H = diq_H[['SEQN', 'DIQ050', 'DIQ070']]
diq_I = diq_I[['SEQN', 'DIQ050', 'DIQ070']]
diq_J = diq_J[['SEQN', 'DIQ050', 'DIQ070']]

diq = pd.concat([diq_B, diq_C, diq_D, diq_E, diq_F, diq_G, diq_H, diq_I, diq_J])
diq.rename(columns={'DIQ050':'taking_insulin', 'DIQ070':'taking_diabetic_pills', 'DID070':'taking_diabetic_pills_D_E'}, inplace=True)

In [55]:
adults = adults.merge(diq, on='SEQN', how='left')

In [56]:
# recode levels for taking insulin / diabetes meds
adults['taking_insulin'] = adults['taking_insulin'].replace(1, 'yes')
adults['taking_insulin'] = adults['taking_insulin'].replace(2, 'no')
adults['taking_insulin'] = adults['taking_insulin'].replace([7, 9], 'unknown')

adults['taking_diabetic_pills'] = adults['taking_diabetic_pills'].replace(1, 'yes')
adults['taking_diabetic_pills'] = adults['taking_diabetic_pills'].replace(2, 'no')
adults['taking_diabetic_pills'] = adults['taking_diabetic_pills'].replace([7, 9], 'unknown')

adults['taking_diabetic_pills_D_E'] = adults['taking_diabetic_pills_D_E'].replace(1, 'yes')
adults['taking_diabetic_pills_D_E'] = adults['taking_diabetic_pills_D_E'].replace(2, 'no')
adults['taking_diabetic_pills_D_E'] = adults['taking_diabetic_pills_D_E'].replace([7, 9], 'unknown')

In [57]:
# create single feature for diabetes (yes/no)
adults['diabetes'] = np.where(adults['fasting_glc_mg_dL'] >= 126, 'yes', 'no')
adults['diabetes'] = np.where(adults['hba1c_percent'] >= 6.5, 'yes', adults['diabetes'])
adults['diabetes'] = np.where(adults['taking_insulin'] == 'yes', 'yes', adults['diabetes'])
adults['diabetes'] = np.where(adults['taking_diabetic_pills'] == 'yes', 'yes', adults['diabetes'])
adults['diabetes'] = np.where(adults['taking_diabetic_pills_D_E'] == 'yes', 'yes', adults['diabetes'])
adults['diabetes'] = np.where(adults['taking_insulin'] == 'unknown', 'unknown', adults['diabetes'])
adults['diabetes'] = np.where(adults['taking_diabetic_pills'] == 'unknown', 'unknown', adults['diabetes'])
adults['diabetes'] = np.where(adults['taking_diabetic_pills_D_E'] == 'unknown', 'unknown', adults['diabetes'])

### Serum lipids

In [58]:
#Lab data - serum lipids (triglycerides)

tg_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L13AM_B.XPT', format='xport', encoding='utf-8')
tg_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/L13AM_C.XPT', format='xport', encoding='utf-8')
tg_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/TRIGLY_D.XPT', format='xport', encoding='utf-8')
tg_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/TRIGLY_E.XPT', format='xport', encoding='utf-8')
tg_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/TRIGLY_F.XPT', format='xport', encoding='utf-8')
tg_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/TRIGLY_G.XPT', format='xport', encoding='utf-8')
tg_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/TRIGLY_H.XPT', format='xport', encoding='utf-8')
tg_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/TRIGLY_I.XPT', format='xport', encoding='utf-8')
tg_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TRIGLY_J.XPT', format='xport', encoding='utf-8')

In [59]:
tg_B = tg_B[['SEQN', 'LBXTR']]
tg_C = tg_C[['SEQN', 'LBXTR']]
tg_D = tg_D[['SEQN', 'LBXTR']]
tg_E = tg_E[['SEQN', 'LBXTR']]
tg_F = tg_F[['SEQN', 'LBXTR']]
tg_G = tg_G[['SEQN', 'LBXTR']]
tg_H = tg_H[['SEQN', 'LBXTR']]
tg_I = tg_I[['SEQN', 'LBXTR']]
tg_J = tg_J[['SEQN', 'LBXTR']]

tg = pd.concat([tg_B, tg_C, tg_D, tg_E, tg_F, tg_G, tg_H, tg_I, tg_J])
tg.rename(columns={'LBXTR':'tg_mg_dL'}, inplace=True)

In [60]:
adults = adults.merge(tg, on='SEQN', how='left')

### Hypertension

In [61]:
#Exam data - blood pressure (hypertension)

bp_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/BPX_B.XPT', format='xport', encoding='utf-8')
bp_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/BPX_C.XPT', format='xport', encoding='utf-8')
bp_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/BPX_D.XPT', format='xport', encoding='utf-8')
bp_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/BPX_E.XPT', format='xport', encoding='utf-8')
bp_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/BPX_F.XPT', format='xport', encoding='utf-8')
bp_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BPX_G.XPT', format='xport', encoding='utf-8')
bp_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/BPX_H.XPT', format='xport', encoding='utf-8')
bp_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/BPX_I.XPT', format='xport', encoding='utf-8')
bp_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BPX_J.XPT', format='xport', encoding='utf-8')

In [62]:
bp_sys_B = bp_B[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys_C = bp_C[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys_D = bp_D[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys_E = bp_E[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys_F = bp_F[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys_G = bp_G[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys_H = bp_H[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys_I = bp_I[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]
bp_sys_J = bp_J[['SEQN', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXSY4']]

bp_sys = pd.concat([bp_sys_B, bp_sys_C, bp_sys_D, bp_sys_E, bp_sys_F, bp_sys_G, bp_sys_H, bp_sys_I, bp_sys_J])
bp_sys.set_index('SEQN', inplace=True)

In [63]:
bp_sys['sys_mean'] = np.nanmean(bp_sys, axis = 1)

  bp_sys['sys_mean'] = np.nanmean(bp_sys, axis = 1)


In [64]:
bp_sys = bp_sys.reset_index()[['SEQN', 'sys_mean']]

In [65]:
bp_sys['sys_ht'] = np.where(bp_sys['sys_mean'] >= 140, 'yes', 'no')

In [66]:
bp_sys.drop(columns='sys_mean', inplace=True)

In [67]:
adults = adults.merge(bp_sys, on='SEQN', how='left')

In [68]:
bp_di_B = bp_B[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_di_C = bp_C[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_di_D = bp_D[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_di_E = bp_E[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_di_F = bp_F[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_di_G = bp_G[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_di_H = bp_H[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_di_I = bp_I[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]
bp_di_J = bp_J[['SEQN', 'BPXDI1', 'BPXDI2', 'BPXDI3', 'BPXDI4']]

bp_di = pd.concat([bp_di_B, bp_di_C, bp_di_D, bp_di_E, bp_di_F, bp_di_G, bp_di_H, bp_di_I, bp_di_J])
bp_di.set_index('SEQN', inplace=True)

In [69]:
bp_di['di_mean'] = np.nanmean(bp_di, axis = 1)

  bp_di['di_mean'] = np.nanmean(bp_di, axis = 1)


In [70]:
bp_di = bp_di.reset_index()[['SEQN', 'di_mean']]

In [71]:
bp_di['di_ht'] = np.where(bp_di['di_mean'] >= 90, 'yes', 'no')

In [72]:
bp_di.drop(columns='di_mean', inplace=True)

In [73]:
adults = adults.merge(bp_di, on='SEQN', how='left')

In [74]:
#Questionnaire data - blood pressure (hypertension)

bpq_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/BPQ_B.XPT', format='xport', encoding='utf-8')
bpq_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/BPQ_C.XPT', format='xport', encoding='utf-8')
bpq_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/BPQ_D.XPT', format='xport', encoding='utf-8')
bpq_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/BPQ_E.XPT', format='xport', encoding='utf-8')
bpq_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/BPQ_F.XPT', format='xport', encoding='utf-8')
bpq_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BPQ_G.XPT', format='xport', encoding='utf-8')
bpq_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/BPQ_H.XPT', format='xport', encoding='utf-8')
bpq_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/BPQ_I.XPT', format='xport', encoding='utf-8')
bpq_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BPQ_J.XPT', format='xport', encoding='utf-8')

In [75]:
bpq_B = bpq_B[['SEQN', 'BPQ040A', 'BPQ050A']]
bpq_C = bpq_C[['SEQN', 'BPQ040A', 'BPQ050A']]
bpq_D = bpq_D[['SEQN', 'BPQ040A', 'BPQ050A']]
bpq_E = bpq_E[['SEQN', 'BPQ040A', 'BPQ050A']]
bpq_F = bpq_F[['SEQN', 'BPQ040A', 'BPQ050A']]
bpq_G = bpq_G[['SEQN', 'BPQ040A', 'BPQ050A']]
bpq_H = bpq_H[['SEQN', 'BPQ040A', 'BPQ050A']]
bpq_I = bpq_I[['SEQN', 'BPQ040A', 'BPQ050A']]
bpq_J = bpq_J[['SEQN', 'BPQ040A', 'BPQ050A']]

bpq = pd.concat([bpq_B, bpq_C, bpq_D, bpq_E, bpq_F, bpq_G, bpq_H, bpq_I, bpq_J])

In [76]:
adults = adults.merge(bpq, on='SEQN', how='left')

In [77]:
adults['hypertension'] = np.where(adults['sys_ht'] == 'yes', 'yes', 'no')
adults['hypertension'] = np.where(adults['di_ht'] == 'yes', 'yes', adults['hypertension'])
adults['hypertension'] = np.where(adults['BPQ040A'] == 1, 'yes', adults['hypertension'])
adults['hypertension'] = np.where(adults['BPQ050A'] == 1, 'yes', adults['hypertension'])
adults['hypertension'] = np.where(adults['BPQ040A'] == 7, 'unknown', adults['hypertension'])
adults['hypertension'] = np.where(adults['BPQ050A'] == 7, 'unknown', adults['hypertension'])
adults['hypertension'] = np.where(adults['BPQ040A'] == 9, 'unknown', adults['hypertension'])
adults['hypertension'] = np.where(adults['BPQ050A'] == 9, 'unknown', adults['hypertension'])

In [78]:
adults['hypertension'].value_counts()

no         1794341
yes        1004904
unknown       1160
Name: hypertension, dtype: int64

### Exclusion criteria - infectious diseases, CVD & Cancer

#### infectious diseases (Hepatitis / HIV)
* Hep A - see note
* Hep B
* Hep C
* Hep D
* HIV

Note on Hep A - Discussion with DL suggested use of elevated liver enzymes to detect true cases of active infection rather than ABs as a result of vaccination. However, there can be several causes of elevated enzymes aside from Hepatitis: "When Alanine transaminase (ALT) rises to more than 500 IU/L, causes are usually from the liver. It can be due to hepatitis, ischemic liver injury, and toxins that causes liver damage. The ALT levels in hepatitis C rises more than in hepatitis A and B. Persistent ALT elevation more than 6 months is known as chronic hepatitis. Alcoholic liver disease, non-alcoholic fatty liver disease (NAFLD), fat accumulation in liver during childhood obesity, steatohepatitis (inflammation of fatty liver disease) are associated with a rise in ALT." https://en.wikipedia.org/wiki/Liver_function_tests

For this reason, not current excluding individuals with positive result for Hep A.

In [79]:
#Lab data - Hepatitis B core antibody / surface antigen

hep_BCD_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L02_B.XPT', format='xport', encoding='utf-8')
hep_BCD_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/L02_C.XPT', format='xport', encoding='utf-8')

hep_B_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/HEPBD_D.XPT', format='xport', encoding='utf-8')
hep_B_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/HEPBD_E.XPT', format='xport', encoding='utf-8')
hep_B_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/HEPBD_F.XPT', format='xport', encoding='utf-8')
hep_B_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/HEPBD_G.XPT', format='xport', encoding='utf-8')
hep_B_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/HEPBD_H.XPT', format='xport', encoding='utf-8')
hep_B_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/HEPBD_I.XPT', format='xport', encoding='utf-8')
hep_B_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HEPBD_J.XPT', format='xport', encoding='utf-8')


#Hepatitis B surface antibody

hep_SA_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L02HBS_B.XPT', format='xport', encoding='utf-8')
hep_SA_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/L02HBS_C.XPT', format='xport', encoding='utf-8')
hep_SA_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/HEPB_S_D.XPT', format='xport', encoding='utf-8')
hep_SA_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/HEPB_S_E.XPT', format='xport', encoding='utf-8')
hep_SA_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/HEPB_S_F.XPT', format='xport', encoding='utf-8')
hep_SA_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/HEPB_S_G.XPT', format='xport', encoding='utf-8')
hep_SA_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/HEPB_S_H.XPT', format='xport', encoding='utf-8')
hep_SA_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/HEPB_S_I.XPT', format='xport', encoding='utf-8')
hep_SA_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HEPB_S_J.XPT', format='xport', encoding='utf-8')



In [80]:
hepB_B = hep_BCD_B[['SEQN', 'LBXHBC', 'LBDHBG']]
hepB_C = hep_BCD_C[['SEQN', 'LBXHBC', 'LBDHBG']]
hepB_D = hep_B_D[['SEQN', 'LBXHBC', 'LBDHBG']]
hepB_E = hep_B_E[['SEQN', 'LBXHBC', 'LBDHBG']]
hepB_F = hep_B_F[['SEQN', 'LBXHBC', 'LBDHBG']]
hepB_G = hep_B_G[['SEQN', 'LBXHBC', 'LBDHBG']]
hepB_H = hep_B_H[['SEQN', 'LBXHBC', 'LBDHBG']]
hepB_I = hep_B_I[['SEQN', 'LBXHBC', 'LBDHBG']]
hepB_J = hep_B_J[['SEQN', 'LBXHBC', 'LBDHBG']]

hepB_SA_B = hep_SA_B[['SEQN', 'LBXHBS']]
hepB_SA_C = hep_SA_C[['SEQN', 'LBXHBS']]
hepB_SA_D = hep_SA_D[['SEQN', 'LBXHBS']]
hepB_SA_E = hep_SA_E[['SEQN', 'LBXHBS']]
hepB_SA_F = hep_SA_F[['SEQN', 'LBXHBS']]
hepB_SA_G = hep_SA_G[['SEQN', 'LBXHBS']]
hepB_SA_H = hep_SA_H[['SEQN', 'LBXHBS']]
hepB_SA_I = hep_SA_I[['SEQN', 'LBXHBS']]
hepB_SA_J = hep_SA_J[['SEQN', 'LBXHBS']]

hep_B_CSA = pd.concat([hepB_B, hepB_C, hepB_D, hepB_E, hepB_F, hepB_G, hepB_H, hepB_I, hepB_J])
hep_B_SA = pd.concat([hepB_SA_B, hepB_SA_C, hepB_SA_D, hepB_SA_E, hepB_SA_F, hepB_SA_G, hepB_SA_H, hepB_SA_I, hepB_SA_J])

In [81]:
adults = adults.merge(hep_B_CSA, on='SEQN', how='left')
adults = adults.merge(hep_B_SA, on='SEQN', how='left')

In [82]:
adults['hep_B_infection'] = np.where(adults['LBXHBC'] == 1, 'yes', 'no')
adults['hep_B_infection'] = np.where(adults['LBDHBG'] == 1, 'yes', adults['hep_B_infection'])
adults['hep_B_infection'] = np.where(adults['LBXHBS'] == 1, 'no', adults['hep_B_infection']) # testing positive for surface antibody indicates immunity from vaccination, not active infection.

In [83]:
adults = adults[adults['hep_B_infection'] == 'no']

In [84]:
#Lab data - Hepatitis C / D

hepC_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/HEPC_D.XPT', format='xport', encoding='utf-8')
hepD_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/HEPBD_D.XPT', format='xport', encoding='utf-8')

hepC_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/HEPC_E.XPT', format='xport', encoding='utf-8')
hepD_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/HEPBD_E.XPT', format='xport', encoding='utf-8')

hepC_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/HEPC_F.XPT', format='xport', encoding='utf-8')
hepD_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/HEPBD_F.XPT', format='xport', encoding='utf-8')

hepC_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/HEPC_G.XPT', format='xport', encoding='utf-8')
hepD_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/HEPBD_G.XPT', format='xport', encoding='utf-8')

hepC_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/SSHEPC_H.XPT', format='xport', encoding='utf-8')
hepD_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/HEPBD_H.XPT', format='xport', encoding='utf-8')

hepC_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/HEPC_I.XPT', format='xport', encoding='utf-8')
hepD_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/HEPBD_I.XPT', format='xport', encoding='utf-8')

hepC_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HEPC_J.XPT', format='xport', encoding='utf-8')
hepD_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HEPBD_J.XPT', format='xport', encoding='utf-8')

In [85]:
hepC_B = hep_BCD_B[['SEQN', 'LBDHCV']]
hepC_C = hep_BCD_C[['SEQN', 'LBDHCV']]
hepC_D = hepC_D[['SEQN', 'LBDHCV']]
hepC_E = hepC_E[['SEQN', 'LBDHCV']]
hepC_F = hepC_F[['SEQN', 'LBDHCV']]
hepC_G = hepC_G[['SEQN', 'LBDHCV']]
hepC_H = hepC_H[['SEQN', 'LBDHCI']]
hepC_I = hepC_I[['SEQN', 'LBXHCR']] # discontinued testing kit, using 'LBXHCR' as assessment
hepC_J = hepC_J[['SEQN', 'LBDHCI']]

hepC_H.rename(columns={'LBDHCI':'LBDHCV'},inplace=True)
hepC_I.rename(columns={'LBXHCR':'LBDHCV'},inplace=True)
hepC_J.rename(columns={'LBDHCI':'LBDHCV'},inplace=True)

hepD_B = hep_BCD_B[['SEQN', 'LBDHD']]
hepD_C = hep_BCD_C[['SEQN', 'LBDHD']]
hepD_D = hepD_D[['SEQN', 'LBDHD']]
hepD_E = hepD_E[['SEQN', 'LBDHD']]
hepD_F = hepD_F[['SEQN', 'LBDHD']]
hepD_G = hepD_G[['SEQN', 'LBDHD']]
hepD_H = hepD_H[['SEQN', 'LBDHD']]
hepD_I = hepD_I[['SEQN', 'LBDHD']]
hepD_J = hepD_J[['SEQN', 'LBDHD']]

hep_C = pd.concat([hepC_B, hepC_C, hepC_D, hepC_E, hepC_F, hepC_G, hepC_H, hepC_I, hepC_J])
hep_D = pd.concat([hepD_B, hepD_C, hepD_D, hepD_E, hepD_F, hepD_G, hepD_H, hepD_I, hepD_J])

In [86]:
adults = adults.merge(hep_C, on='SEQN', how='left')
adults = adults.merge(hep_D, on='SEQN', how='left')

In [87]:
adults['hep_C_infection'] = np.where(adults['LBDHCV'] == 1, 'yes', 'no')
adults['hep_D_infection'] = np.where(adults['LBDHD'] == 1, 'yes', 'no')

In [88]:
adults = adults[adults['hep_C_infection'] == 'no']
adults = adults[adults['hep_D_infection'] == 'no']

In [89]:
#Lab data - HIV

hiv_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L03_B.XPT', format='xport', encoding='utf-8')
hiv_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/L03_C.XPT', format='xport', encoding='utf-8')
hiv_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/hiv_D.XPT', format='xport', encoding='utf-8')
hiv_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/hiv_E.XPT', format='xport', encoding='utf-8')
hiv_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/hiv_F.XPT', format='xport', encoding='utf-8')
hiv_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/hiv_G.XPT', format='xport', encoding='utf-8')
hiv_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/hiv_H.XPT', format='xport', encoding='utf-8')
hiv_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/hiv_I.XPT', format='xport', encoding='utf-8')
hiv_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/hiv_J.XPT', format='xport', encoding='utf-8')

In [90]:
hiv_I = hiv_I[hiv_I['LBXHIVC'] == 1]
hiv_I = hiv_I[hiv_I['LBXHNAT'] != 2]
hiv_I.rename(columns={'LBXHIVC':'LBDHI'}, inplace=True)
hiv_I = hiv_I[['SEQN', 'LBDHI']]

In [91]:
hiv_J = hiv_J[hiv_J['LBXHIVC'] == 1]
hiv_J = hiv_J[hiv_J['LBXHNAT'] != 2]
hiv_J.rename(columns={'LBXHIVC':'LBDHI'}, inplace=True)
hiv_J = hiv_J[['SEQN', 'LBDHI']]

In [92]:
hiv = pd.concat([hiv_B, hiv_C, hiv_D, hiv_E, hiv_F, hiv_G, hiv_H, hiv_I, hiv_J])

In [93]:
hiv_pos = hiv[hiv['LBDHI']==1]

In [94]:
adults = adults[~adults['SEQN'].isin(hiv_pos['SEQN'])]

In [95]:
#Questionnaire data - CVD / Cancer

mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
mc_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/MCQ_D.XPT', format='xport', encoding='utf-8')
mc_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/MCQ_E.XPT', format='xport', encoding='utf-8')
mc_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/MCQ_F.XPT', format='xport', encoding='utf-8')
mc_G = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/MCQ_G.XPT', format='xport', encoding='utf-8')
mc_H = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/MCQ_H.XPT', format='xport', encoding='utf-8')
mc_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/MCQ_I.XPT', format='xport', encoding='utf-8')
mc_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/MCQ_J.XPT', format='xport', encoding='utf-8')

  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
  mc_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT', format='xport', encoding='utf-8')
 

  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
  mc_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/MCQ_C.XPT', format='xport', encoding='utf-8')
 

In [96]:
mc_B = mc_B[['SEQN', 'MCQ180C', 'MCQ220']]
mc_C = mc_C[['SEQN', 'MCQ180C', 'MCQ220']]
mc_D = mc_D[['SEQN', 'MCQ180C', 'MCQ220']]
mc_E = mc_E[['SEQN', 'MCQ180C', 'MCQ220']]
mc_F = mc_F[['SEQN', 'MCQ180C', 'MCQ220']]
mc_G = mc_G[['SEQN', 'MCQ180C', 'MCQ220']]
mc_H = mc_H[['SEQN', 'MCQ180C', 'MCQ220']]
mc_I = mc_I[['SEQN', 'MCQ180C', 'MCQ220']]
mc_J = mc_J[['SEQN', 'MCQ160C', 'MCQ220']]
mc_J.rename(columns={'MCQ160C':'MCQ180C'}, inplace=True)

mc = pd.concat([mc_B, mc_C, mc_D, mc_E, mc_F, mc_G, mc_H, mc_I, mc_J])

In [97]:
adults = adults.merge(mc, on='SEQN', how='left')

In [98]:
adults['MCQ180C'] = np.where(adults['MCQ180C'] < 85, 'yes', 'no')
adults['MCQ220'] = np.where(adults['MCQ220'] == 1, 'yes', 'no')

In [99]:
adults = adults[adults['MCQ180C'] != 'yes']
adults = adults[adults['MCQ220'] != 'yes']

### C-reactive protein

In [100]:
#Lab data - CRP (No CRP data collected from 2011-2014)

crp_B = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L11_B.XPT', format='xport', encoding='utf-8')
crp_C = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/L11_C.XPT', format='xport', encoding='utf-8')
crp_D = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/CRP_D.XPT', format='xport', encoding='utf-8')
crp_E = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/CRP_E.XPT', format='xport', encoding='utf-8')
crp_F = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/CRP_F.XPT', format='xport', encoding='utf-8')
crp_I = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/HSCRP_I.XPT', format='xport', encoding='utf-8')
crp_J = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/HSCRP_J.XPT', format='xport', encoding='utf-8')

In [101]:
crp_I.rename(columns={'LBXHSCRP':'LBXCRP'}, inplace=True)
crp_J.rename(columns={'LBXHSCRP':'LBXCRP'}, inplace=True)

crp_B = crp_B[['SEQN', 'LBXCRP']]
crp_C = crp_C[['SEQN', 'LBXCRP']]
crp_D = crp_D[['SEQN', 'LBXCRP']]
crp_E = crp_E[['SEQN', 'LBXCRP']]
crp_F = crp_F[['SEQN', 'LBXCRP']]
crp_I = crp_I[['SEQN', 'LBXCRP']]
crp_J = crp_J[['SEQN', 'LBXCRP']]

In [102]:
crp_convert = pd.concat([crp_I, crp_J])
crp_convert['LBXCRP'] = crp_convert['LBXCRP'] / 10 # convert from mg/L to mg/dL

crp = pd.concat([crp_convert, crp_B, crp_C, crp_D, crp_E, crp_F])

In [103]:
adults = adults.merge(crp, on='SEQN', how='left')

In [104]:
adults_crp = adults.dropna(subset='LBXCRP')

In [105]:
# exclude those with CRP > 10 mg/L
adults_crp = adults_crp[adults_crp['LBXCRP'] <= 10]

In [106]:
adults.SEQN.nunique() - adults_crp.SEQN.nunique()

10223

In [139]:
design = adults[['SEQN','WTMEC2YR', 'SDMVPSU', 'SDMVSTRA']].drop_duplicates('SEQN')

In [140]:
design['mecwts'] = design['WTMEC2YR'] / 7 # calculate sample weights based on number of included cycles

In [107]:
adults_crp.rename(columns={'RIDRETH1':'Ethnicity', 'INDFMPIR': 'family_pir', 'LBXCRP': 'crp'},inplace=True)

In [108]:
# reduce correlated features
adults_crp['vitamin_b12_total'] = adults_crp['Vitamin B-12'] + adults_crp['Vitamin B-12, added']
adults_crp['vitamin_e_total'] = adults_crp['Vitamin E (alpha-tocopherol)'] + adults_crp['Vitamin E, added'] # reduce correlated features

In [109]:
adults_crp = adults_crp[['SEQN', 'ingred_code', 'ingred_desc', 'Ingred_consumed_g', 'Sex', 'Age', 'body_wt',
       'Ethnicity', 'family_pir', 'education', 'BMI', 'Capric acid', 'Lauric acid', 'Myristic acid', 'Palmitic acid', 'Palmitoleic acid',
                         'Stearic acid', 'Oleic acid', 'Linoleic acid', 'Linolenic acid', 'Stearidonic acid',
                         'Eicosenoic acid', 'Arachidonic acid', 'Eicosapentaenoic acid', 'Erucic acid',
                         'Docosapentaenoic acid', 'Docosahexaenoic acid', 'Butyric acid', 'Caproic acid',
                         'Caprylic acid', 'Alcohol', 'Caffeine', 'Calcium', 'Carbohydrate', 'Carotene, alpha',
                         'Carotene, beta', 'Cholesterol', 'Choline, total', 'Copper', 'Cryptoxanthin, beta',
                         'Energy', 'Fatty acids, total monounsaturated', 'Fatty acids, total polyunsaturated',
                         'Fatty acids, total saturated', 'Fiber, total dietary', 'Folate, DFE','Iron',
                         'Lutein + zeaxanthin', 'Lycopene', 'Magnesium', 'Niacin', 'Phosphorus', 'Potassium',
                         'Protein', 'Retinol', 'Riboflavin', 'Selenium', 'Sodium', 'Sugars, total', 'Theobromine',
                         'Thiamin', 'Total Fat', 'Vitamin A, RAE', 'vitamin_b12_total', 'Vitamin B-6', 'Vitamin C',
                         'Vitamin D (D2 + D3)', 'vitamin_e_total', 'Vitamin K (phylloquinone)', 'Water', 'Zinc',
       'ever_smoker', 'diabetes', 'hypertension', 'crp', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU',
       'SDMVSTRA', 'CYCLE', 'diet_wts']]

In [110]:
adults_crp['diet_wts'] = adults_crp['diet_wts'] / 7 # calculate sample weights based on number of included cycles

In [111]:
energy = adults_crp.groupby('SEQN')['Energy'].agg(np.sum)

In [112]:
energy = energy.reset_index()

In [113]:
energy = energy[energy['Energy']>500]

In [114]:
energy = energy[energy['Energy']<4500]

In [115]:
e_cent = energy[energy['Energy']> np.percentile(energy['Energy'], 5)] 

In [116]:
e_cent = e_cent[e_cent['Energy']< np.percentile(energy['Energy'], 95)] 

In [117]:
adults_qc = adults_crp[adults_crp['SEQN'].isin(e_cent['SEQN'])] 

In [118]:
taxa_hfe = adults_qc.drop_duplicates(subset='SEQN')

In [119]:
taxa_hfe = taxa_hfe[['SEQN', 'crp', 'Age', 'Sex', 'education', 'Ethnicity', 'family_pir', 'BMI', 'ever_smoker', 'diabetes', 'hypertension', 'diet_wts']]

In [121]:
taxa_hfe.isna().sum()

SEQN               0
crp                0
Age                0
Sex                0
education          0
Ethnicity          0
family_pir      1635
BMI                0
ever_smoker     1643
diabetes           0
hypertension       0
diet_wts           0
dtype: int64

In [122]:
# remove particpiants with missing data, and those with 'unknown' for covariates
taxa_hfe = taxa_hfe.dropna()
taxa_hfe = taxa_hfe[taxa_hfe['education']!='unknown']
taxa_hfe = taxa_hfe[taxa_hfe['ever_smoker']!='unknown']
taxa_hfe = taxa_hfe[taxa_hfe['diabetes']!='unknown']
taxa_hfe = taxa_hfe[taxa_hfe['hypertension']!='unknown']

In [145]:
crp_target = taxa_hfe[['SEQN', 'crp']]

In [146]:
# create feature for tertiles of crp
tertiles = crp_target['crp'].quantile([1/3, 2/3]).tolist()
tertiles = [0] + tertiles + [float('inf')]
crp_target['crp_class'] = pd.cut(crp_target['crp'], bins=tertiles, labels=[0, 9, 1]) # mid tertile = 9, remove
crp_target['crp_table'] = pd.cut(crp_target['crp'], bins=tertiles, labels=[1, 2, 3]) # mid tertile = 9, remove

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crp_target['crp_class'] = pd.cut(crp_target['crp'], bins=tertiles, labels=[0, 9, 1]) # mid tertile = 9, remove
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crp_target['crp_table'] = pd.cut(crp_target['crp'], bins=tertiles, labels=[1, 2, 3]) # mid tertile = 9, remove


In [147]:
design_crp = crp_target.merge(design, on='SEQN', how='left')

In [150]:
design_crp[['SEQN', 'crp_table', 'SDMVPSU', 'SDMVSTRA', 'mecwts']].to_csv('../../data/00/crp_table1.csv',index=None)

In [130]:
crp_target = crp_target[crp_target['crp_class']!=9]
crp_target[['SEQN', 'crp_class']].to_csv('../../data/00/crp_class_target.csv',index=None)

In [123]:
taxa_hfe[['SEQN', 'diet_wts']].to_csv('../../data/00/wweia_wts.csv', index=None)

In [124]:
taxa_hfe.drop(columns=['crp', 'diet_wts'], inplace=True)
taxa_hfe.to_csv('../../data/00/wweia_covariates.csv', index=None)

In [125]:
adults_qc = adults_qc[adults_qc['SEQN'].isin(taxa_hfe['SEQN'])]

In [126]:
adults_qc.ingred_desc.value_counts().to_csv('../../data/00/ingredient_counts.csv')

In [127]:
adults_qc.to_csv('../../data/00/wweia_qc_crp.txt', sep='\t', index=None)

In [128]:
# create unique ingredient code file for food tree
wweia_unique = adults_qc[['ingred_code', 'ingred_desc']]
wweia_unique.drop_duplicates(subset='ingred_code').to_csv('../../data/00/wweia_crp_unique_ingred_codes.csv', index=None)