In [None]:
import os

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, precision_score, recall_score,
    average_precision_score, confusion_matrix
)

# 1. Setup Data Paths
# Every input file lives under ./data/, grouped by NHANES component.
# Each path is the component directory joined with a fixed file name.

# Wearable (accelerometer-derived) minute-level tables + subject metadata.
nhanes_step_count_dir = "./data/nhanes-step-count/"
subject_info_path, actisteps_path, ac_path, mims_path = (
    os.path.join(nhanes_step_count_dir, file_name)
    for file_name in (
        "subject-info.csv",
        "nhanes_1440_actisteps.csv.xz",
        "nhanes_1440_AC.csv.xz",
        "nhanes_1440_PAXMTSM.csv.xz",
    )
)

# Laboratory glycohemoglobin files, one per survey cycle.
nhanes_lab_dir = "./data/nhanes-lab/"
ghb_path_2011, ghb_path_2013 = (
    os.path.join(nhanes_lab_dir, file_name)
    for file_name in ("ghb-2011-12.xpt", "ghb-2013-14.xpt")
)

# Blood-pressure / cholesterol questionnaire files, one per survey cycle.
nhanes_questionnaire_dir = "./data/nhanes-questionnaire/"
bpq_path_2011, bpq_path_2013 = (
    os.path.join(nhanes_questionnaire_dir, file_name)
    for file_name in ("bpq-2011-12.xpt", "bpq-2013-14.xpt")
)

In [47]:
import pandas as pd

# 2. Load NHANES Step-Count Data
# SEQN (the subject identifier) is read as a nullable integer in every table
# so that the tables share one key dtype.
print("Loading Wearable Movement Data... (this may take a few minutes)")

seqn_dtype = {"SEQN": "Int64"}

subj_df = pd.read_csv(subject_info_path, dtype=seqn_dtype)

# The three minute-level tables are read with identical options.
actisteps_df, ac_df, mims_df = (
    pd.read_csv(table_path, dtype=seqn_dtype, low_memory=False)
    for table_path in (actisteps_path, ac_path, mims_path)
)

print("Wearable Data Shape:")
for shape_label, loaded_frame in (
    ("- Subject Info:", subj_df),
    ("- Actisteps:", actisteps_df),
    ("- Activity Counts:", ac_df),
    ("- MIMS:", mims_df),
):
    print(shape_label, loaded_frame.shape)

Loading Wearable Movement Data... (this may take a few minutes)
Wearable Data Shape:
- Subject Info: (19931, 8)
- Actisteps: (130186, 1443)
- Activity Counts: (130186, 1443)
- MIMS: (130186, 1443)
Wearable Data Shape:
- Subject Info: (19931, 8)
- Actisteps: (130186, 1443)
- Activity Counts: (130186, 1443)
- MIMS: (130186, 1443)


In [48]:
# 3. Feature Engineering for NHANES Step-Count Data
# Builds `df`: one row per SEQN with mean/SD of daily steps, mean/SD of daily
# activity counts, mean daily MIMS, gender (0 = Male, 1 = Female) and age.
#
# The loaded frames (actisteps_df, ac_df, mims_df, subj_df) are only read,
# never reassigned or modified, so this cell can be re-run safely.
print("Computing Wearable Data Features...")


def _per_subject_daily_stats(minute_df, total_col, stats, drop_zero_days=False):
    """Collapse a minute-level table into per-subject statistics of row totals.

    Parameters
    ----------
    minute_df : pandas.DataFrame
        Table with a "SEQN" column and per-minute columns whose names start
        with "min_". Each row is summed into a single total.
    total_col : str
        Name given to the per-row total.
    stats : dict[str, str]
        Maps output column name -> pandas aggregation name ("mean", "std").
    drop_zero_days : bool, default False
        When True, rows whose total is not strictly positive are discarded
        before aggregating.

    Returns
    -------
    pandas.DataFrame
        Columns "SEQN" followed by one column per entry in `stats`. Outputs of
        "std" aggregations have NaN replaced by 0.0 (pandas returns NaN for a
        group with a single row).
    """
    minute_cols = [c for c in minute_df.columns if c.startswith("min_")]
    # Work on a two-column copy so the (large) input frame is left untouched.
    daily = minute_df[["SEQN"]].copy()
    daily[total_col] = minute_df[minute_cols].sum(axis=1, numeric_only=True)
    if drop_zero_days:
        daily = daily[daily[total_col] > 0]

    per_subject = (
        daily.groupby("SEQN")
        .agg(**{out_name: (total_col, func) for out_name, func in stats.items()})
        .reset_index()
    )
    sd_cols = [out_name for out_name, func in stats.items() if func == "std"]
    if sd_cols:
        per_subject[sd_cols] = per_subject[sd_cols].fillna(0.0)
    return per_subject


# Daily step statistics; rows with zero total steps are excluded.
actisteps_agg = _per_subject_daily_stats(
    actisteps_df,
    "daily_steps",
    {"mean_daily_steps": "mean", "sd_daily_steps": "std"},
    drop_zero_days=True,
)

# Activity-count and MIMS statistics.
# NOTE(review): the zero-step filter above is applied to the step table only;
# activity counts and MIMS are averaged over every row. Confirm whether those
# rows should be excluded here as well.
ac_agg = _per_subject_daily_stats(
    ac_df,
    "daily_AC",
    {"mean_daily_AC": "mean", "sd_daily_AC": "std"},
)
mims_agg = _per_subject_daily_stats(
    mims_df,
    "daily_mims_sum",
    {"mean_daily_mims": "mean"},
)

# Subjects are defined by the step table; the other features are left-joined.
df = (
    actisteps_agg
    .merge(ac_agg, on="SEQN", how="left")
    .merge(mims_agg, on="SEQN", how="left")
)

# Merge with Subject Info
print(subj_df.columns.tolist())
# Derive a separate demographics frame instead of overwriting subj_df:
# overwriting it with renamed columns made a second run of this cell fail
# with a KeyError on "age_in_years_at_screening".
subj_features = (
    subj_df[["SEQN", "gender", "age_in_years_at_screening"]]
    .rename(columns={"age_in_years_at_screening": "age"})
    .assign(gender=lambda d: d["gender"].map({"Male": 0, "Female": 1}))
)
df = df.merge(subj_features, on="SEQN", how="left")

df.describe()

Computing Wearable Data Features...
['SEQN', 'data_release_cycle', 'gender', 'age_in_years_at_screening', 'full_sample_2_year_interview_weight', 'full_sample_2_year_mec_exam_weight', 'masked_variance_pseudo_psu', 'masked_variance_pseudo_stratum']
['SEQN', 'data_release_cycle', 'gender', 'age_in_years_at_screening', 'full_sample_2_year_interview_weight', 'full_sample_2_year_mec_exam_weight', 'masked_variance_pseudo_psu', 'masked_variance_pseudo_stratum']


Unnamed: 0,SEQN,mean_daily_steps,sd_daily_steps,mean_daily_AC,sd_daily_AC,mean_daily_mims,gender,age
count,14685.0,14685.0,14685.0,14685.0,14685.0,14685.0,14685.0,14685.0
mean,73183.777528,9696.561159,4827.944617,2151068.0,1063271.0,11352.391422,0.510861,35.753899
std,6486.391406,3703.772504,1902.143937,883331.1,443527.9,4329.524632,0.499899,23.184655
min,62161.0,1.5,0.0,125.6192,0.0,1.344667,0.0,3.0
25%,67313.0,7313.111111,3593.25027,1571732.0,757507.7,8623.339,0.0,14.0
50%,74099.0,9805.111111,4756.055824,2129310.0,1028943.0,11335.417778,1.0,33.0
75%,78971.0,12158.666667,5954.645665,2734121.0,1333755.0,14213.382,1.0,55.0
max,83731.0,29042.222222,18750.860952,7206275.0,7410255.0,35693.532444,1.0,80.0


In [None]:
# 3.5 - Load NHANES Blood Pressure Questionnaire Data + Merge Features
# Adds the BPQ020 / BPQ080 questionnaire answers to `df`, keeping only the
# subjects that have both answers (inner join after dropping missing values).

bpq_2011 = pd.read_sas(bpq_path_2011, format="xport")
bpq_2013 = pd.read_sas(bpq_path_2013, format="xport")

# BPQ020: Ever told you had high blood pressure
# BPQ080: Doctor told you - high cholesterol level
# NOTE(review): the raw response codes are passed through unchanged as numeric
# features. Check the NHANES codebook for non-response codes (e.g. refused /
# don't know) and decide whether they should be recoded or dropped.
bpq_feature_cols = ["BPQ020", "BPQ080"]
bpq = (
    pd.concat([bpq_2011, bpq_2013], ignore_index=True)[["SEQN"] + bpq_feature_cols]
    .dropna()
    # read_sas yields a float SEQN; cast to the nullable integer dtype that
    # the wearable tables were loaded with so the join keys match.
    .astype({"SEQN": "Int64"})
)

# Drop any BPQ columns left over from a previous run of this cell; merging
# again without this would create BPQ020_x / BPQ020_y style duplicates.
df = df.drop(columns=bpq_feature_cols, errors="ignore").merge(
    bpq, on="SEQN", how="inner"
)

df.describe()

FileNotFoundError: [Errno 2] No such file or directory: './data/questionnaire/BPQ-2013-14.xpt'

In [None]:
#4 Load NHANES Laboratory Glycohemoglobin Data + Calculate Ground Truth Diabetes Binary
# Adds the `diabetes_binary` label to `df`, keeping only subjects that have an
# A1C measurement (inner join).
print("Loading Lab A1C Data...")

# Use the paths from the setup cell so the file locations are defined once.
ghb_2011 = pd.read_sas(ghb_path_2011, format="xport")
ghb_2013 = pd.read_sas(ghb_path_2013, format="xport")

print("Lab Data Shape:")
print("- GHB 2011-2012:", ghb_2011.shape)
print("- GHB 2013-2014:", ghb_2013.shape)

print("Calculating ground truth with A1C lab data...")

# A1C Diabetes Criteria (We include prediabetes as diabetes):
#   normal < 5.7
#   prediabetes 5.7-6.4
#   diabetes >= 6.5
# NOTE(review): the describe() output recorded under this cell reports a
# positive rate of roughly 9%. Confirm it was produced with this cutoff and
# re-run the cell if it was not.
a1c_positive_cutoff = 5.7

# Combine 2011-12 and 2013-14 data: stack both cycles into one `a1c` column,
# drop missing measurements, and keep the first row per subject.
ghb = (
    pd.concat(
        [ghb_2011[["SEQN", "LBXGH"]], ghb_2013[["SEQN", "LBXGH"]]],
        ignore_index=True,
    )
    .rename(columns={"LBXGH": "a1c"})
    .dropna(subset=["a1c"])
    .drop_duplicates(subset=["SEQN"], keep="first")
    # read_sas yields a float SEQN; cast to the nullable integer dtype that
    # the wearable tables were loaded with so the join keys match.
    .astype({"SEQN": "Int64"})
)
ghb["diabetes_binary"] = (ghb["a1c"] >= a1c_positive_cutoff).astype(int)
ghb = ghb[["SEQN", "diabetes_binary"]].copy()

# Drop a label column left over from a previous run of this cell; merging
# again without this would create diabetes_binary_x / diabetes_binary_y and
# break the split cell that reads df["diabetes_binary"].
df = df.drop(columns=["diabetes_binary"], errors="ignore").merge(
    ghb, on="SEQN", how="inner"
)

df.describe()

Loading Lab A1C Data...
Lab Data Shape:
- GHB 2011-2012: (6549, 2)
- GHB 2013-2014: (6979, 2)
Calculating ground truth with A1C lab data...


Unnamed: 0,SEQN,mean_daily_steps,sd_daily_steps,mean_daily_AC,sd_daily_AC,mean_daily_mims,gender,age,diabetes_binary
count,11297.0,11297.0,11297.0,11297.0,11297.0,11297.0,11297.0,11297.0,11297.0
mean,73044.197575,9353.983363,4678.675234,1988807.0,980710.7,10539.750274,0.515978,42.759494,0.093742
std,6492.449564,3656.954725,1938.013479,776447.5,403082.7,3757.227917,0.499767,20.607693,0.291482
min,62161.0,3.0,0.0,186.2902,0.0,1.344667,0.0,12.0,0.0
25%,67229.0,6989.555556,3390.468444,1503062.0,712465.6,8281.954111,0.0,24.0,0.0
50%,73847.0,9287.222222,4526.853985,1984948.0,949931.1,10607.631222,1.0,42.0,0.0
75%,78922.0,11617.666667,5782.260395,2486553.0,1213270.0,12980.228778,1.0,60.0,0.0
max,83729.0,29042.222222,18750.860952,7206275.0,7410255.0,35693.532444,1.0,80.0,1.0


In [50]:
# 5 Split Data into Train/Val/Test:
# 70% train, then the remaining 30% is halved into validation and test.
# Both splits are stratified on the label and use the same seed.
print("Splitting training for train/val/test...")

# Features are every column except the subject id and the label.
X = df.drop(["SEQN", "diabetes_binary"], axis=1)
y = df.loc[:, "diabetes_binary"].astype(int)

# Stage 1: hold out 30% of the rows.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Stage 2: split the held-out rows evenly into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

print("Split sizes (train/val/test):", *map(len, (X_train, X_val, X_test)))

Splitting training for train/val/test...
Split sizes (train/val/test): 7907 1695 1695
Split sizes (train/val/test): 7907 1695 1695


In [51]:
# 6. Train Random Forest
# -----------------------------
# Fit a class-weighted random forest, pick the decision threshold that
# maximises F1 on the validation split, lower it by 20%, and report metrics
# on the test split at that lowered threshold.
model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
)
model.fit(X_train, y_train)

# choose threshold by maximizing F1 on validation
y_val_prob = model.predict_proba(X_val)[:, 1]
candidate_thresholds = np.linspace(0.05, 0.95, 200)
val_f1_scores = [
    f1_score(y_val, (y_val_prob >= threshold).astype(int))
    for threshold in candidate_thresholds
]
# argmax returns the first maximum, i.e. the lowest threshold among ties.
best_idx = int(np.argmax(val_f1_scores))
best_t = candidate_thresholds[best_idx]
best_f1 = val_f1_scores[best_idx]
print("Best validation threshold:", best_t, "F1:", best_f1)

# lower threshold to boost recall
adjusted_t = best_t * 0.8
print("Adjusted (recall-boosted) threshold:", adjusted_t)

y_test_prob = model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_prob >= adjusted_t).astype(int)

# Threshold-dependent metrics use the hard predictions.
for metric_label, metric_fn in (
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
    ("Test F1:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_test_pred))
# PR AUC is threshold-free and is computed from the predicted probabilities.
print("Test PR AUC:", average_precision_score(y_test, y_test_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

Best validation threshold: 0.14045226130653266 F1: 0.27938671209540034
Adjusted (recall-boosted) threshold: 0.11236180904522614
Test Precision: 0.18363636363636363
Test Recall: 0.6352201257861635
Test F1: 0.2849083215796897
Test PR AUC: 0.22204354827581907
Confusion Matrix:
 [[1087  449]
 [  58  101]]
