In [2]:
# run with: python nhanes_wearable_diabetes.py
# required: pandas, numpy, scikit-learn, wget (or run wget from shell)
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, precision_score, recall_score,
    average_precision_score, confusion_matrix
)

# -----------------------------
# 0. Import PhysioNet package (one-time)
# -----------------------------
# If you want to download via command-line from the PhysioNet host, run:
# wget -r -N -c -np https://physionet.org/files/minute-level-step-count-nhanes/1.0.1/

# -----------------------------
# 1. Paths (update if you put files elsewhere)
# -----------------------------
base_dir = "./data/nhanes-step-count/"   # set to where you put the PhysioNet folder
subject_info_path = os.path.join(base_dir, "subject-info.csv")
actisteps_path = os.path.join(base_dir, "nhanes_1440_actisteps.csv.xz")
ac_path = os.path.join(base_dir, "nhanes_1440_AC.csv.xz")          # activity counts (if present)
mims_path = os.path.join(base_dir, "nhanes_1440_PAXMTSM.csv.xz")  # MIMS summary (if present)

# sanity check files exist
for p in [subject_info_path, actisteps_path]:
    if not os.path.exists(p):
        raise FileNotFoundError(f"Required file not found: {p}\nPlease download the PhysioNet package and set base_dir correctly.")

In [3]:

# -----------------------------
# 2. Load subject info (demographics)
# -----------------------------
print("Loading subject-info.csv ...")
subj = pd.read_csv(subject_info_path, dtype={"SEQN": str})
# keep SEQN as string for consistent merging
subj["SEQN"] = subj["SEQN"].astype(str)
print("subject-info rows:", len(subj))
print(subj.columns.tolist())


Loading subject-info.csv ...
subject-info rows: 19931
['SEQN', 'data_release_cycle', 'gender', 'age_in_years_at_screening', 'full_sample_2_year_interview_weight', 'full_sample_2_year_mec_exam_weight', 'masked_variance_pseudo_psu', 'masked_variance_pseudo_stratum']


In [4]:
# -----------------------------
# 3. Load daily actisteps file and compute per-day totals -> per-subject features
# -----------------------------
# The actisteps file contains one row per subject-day with columns: SEQN, PAXDAYM, PAXDAYWM, min_0001 ... min_1440
# We'll compute daily_sum = sum(min_0001...min_1440), mark valid day (daily_sum > 0), then aggregate per SEQN.
print("Loading daily actisteps (compressed)... This may take a few minutes.")
df_actisteps = pd.read_csv(actisteps_path, dtype={"SEQN": str}, low_memory=False)
# Identify minute columns (min_0001 ... min_1440)
minute_cols = [c for c in df_actisteps.columns if c.startswith("min_")]
print("Found minute columns:", len(minute_cols))

# compute daily totals (sum across minutes)
df_actisteps["daily_steps"] = df_actisteps[minute_cols].sum(axis=1, numeric_only=True)

# create a 'valid_day' mask: require daily_steps > 0 and not flagged as missing/quality flag if available
# (PAXFLGSM or similar flags may exist in other files; for simplicity use daily_steps>0)
df_actisteps["valid_day"] = df_actisteps["daily_steps"] > 0

# aggregate per subject
agg = df_actisteps.groupby("SEQN").agg(
    mean_daily_steps=("daily_steps", "mean"),
    median_daily_steps=("daily_steps", "median"),
    sd_daily_steps=("daily_steps", "std"),
    valid_day_count=("valid_day", "sum"),
    total_days=("daily_steps", "size")
).reset_index()

# if sd is NaN (single day), fill with 0
agg["sd_daily_steps"] = agg["sd_daily_steps"].fillna(0.0)
agg["pct_valid_days"] = agg["valid_day_count"] / agg["total_days"]

print("Per-subject aggregated step features:", agg.shape)

Loading daily actisteps (compressed)... This may take a few minutes.
Found minute columns: 1440
Per-subject aggregated step features: (14693, 7)


In [5]:
# -----------------------------
# 4. Optional: load AC / MIMS files (if you want activity counts / MIMS)
# -----------------------------
if os.path.exists(ac_path):
    print("Loading AC (activity counts) file to compute mean AC per day...")
    df_ac = pd.read_csv(ac_path, dtype={"SEQN": str}, low_memory=False)
    # AC file rows are daily with min_ columns again
    min_cols_ac = [c for c in df_ac.columns if c.startswith("min_")]
    df_ac["daily_AC_sum"] = df_ac[min_cols_ac].sum(axis=1, numeric_only=True)
    ac_agg = df_ac.groupby("SEQN").agg(
        mean_daily_AC=("daily_AC_sum", "mean"),
        sd_daily_AC=("daily_AC_sum", "std")
    ).reset_index()
    ac_agg["sd_daily_AC"] = ac_agg["sd_daily_AC"].fillna(0.0)
    # merge into agg
    agg = agg.merge(ac_agg, on="SEQN", how="left")

if os.path.exists(mims_path):
    print("Loading MIMS file to get mean MIMS per day...")
    df_mims = pd.read_csv(mims_path, dtype={"SEQN": str}, low_memory=False)
    # PAXMTSM daily summary (min_ cols are MIMS per minute)
    min_cols_mims = [c for c in df_mims.columns if c.startswith("min_")]
    df_mims["daily_mims_sum"] = df_mims[min_cols_mims].sum(axis=1, numeric_only=True)
    mims_agg = df_mims.groupby("SEQN").agg(
        mean_daily_mims=("daily_mims_sum", "mean")
    ).reset_index()
    agg = agg.merge(mims_agg, on="SEQN", how="left")

Loading AC (activity counts) file to compute mean AC per day...
Loading MIMS file to get mean MIMS per day...


In [6]:
# -----------------------------
# 5. Merge with subject-info to bring in demographics (Age, Sex, BMI if present)
# -----------------------------
# subject-info columns differ; common ones include AGE, Sex, etc. Inspect and pick.
print("Merging aggregated wearable features with subject-info...")
df = agg.merge(subj, on="SEQN", how="left")

# rename some demographic columns if needed (inspect)
print("Merged shape:", df.shape)
print("Sample columns:", df.columns.tolist()[:20])

Merging aggregated wearable features with subject-info...
Merged shape: (14693, 17)
Sample columns: ['SEQN', 'mean_daily_steps', 'median_daily_steps', 'sd_daily_steps', 'valid_day_count', 'total_days', 'pct_valid_days', 'mean_daily_AC', 'sd_daily_AC', 'mean_daily_mims', 'data_release_cycle', 'gender', 'age_in_years_at_screening', 'full_sample_2_year_interview_weight', 'full_sample_2_year_mec_exam_weight', 'masked_variance_pseudo_psu', 'masked_variance_pseudo_stratum']


In [7]:
# -----------------------------
# 6. Load NHANES Glycohemoglobin files (2011-2012 and 2013-2014) and merge LBXGH
# -----------------------------

print("Reading NHANES glycohemoglobin (HbA1c) files from CDC...")
try:
    ghb_g = pd.read_sas("./data/nhanes-lab/ghb-2011-12.xpt", format="xport")
    ghb_h = pd.read_sas("./data/nhanes-lab/ghb-2013-14.xpt", format="xport")
except Exception as e:
    raise RuntimeError("Could not read GHB XPT files via pandas.read_sas. "
                       "You can manually download the XPTs from the CDC NHANES pages and point ghb_g_path to local file.") from e

# standardize LBXGH column name (NHANES uses LBXGH for HbA1c %)
ghb_g = ghb_g[["SEQN", "LBXGH"]].rename(columns={"LBXGH": "LBXGH_G"})
ghb_h = ghb_h[["SEQN", "LBXGH"]].rename(columns={"LBXGH": "LBXGH_H"})

# combine rows: prefer G (2011-2012) then H (2013-2014) depending on SEQN (SEQN is unique per cycle),
# but simplest is concatenate then dedupe latest if duplicated.
ghb = pd.concat([ghb_g, ghb_h], ignore_index=True)
print("Columns in GHB file:", ghb.columns.tolist())
ghb["HBA1C"] = ghb["LBXGH_G"].fillna(ghb["LBXGH_H"])
ghb = ghb.dropna(subset=["HBA1C"])
ghb = ghb.drop_duplicates(subset=["SEQN"], keep="first")

print("Glycohemoglobin rows after concat:", len(ghb))

Reading NHANES glycohemoglobin (HbA1c) files from CDC...
Columns in GHB file: ['SEQN', 'LBXGH_G', 'LBXGH_H']
Glycohemoglobin rows after concat: 12788


In [8]:
# -----------------------------
# 7. Merge wearable features with glycohemoglobin (on SEQN) to make modeling table
# -----------------------------
df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")
ghb["SEQN"] = pd.to_numeric(ghb["SEQN"], errors="coerce").astype("Int64")
df = df.merge(ghb, on="SEQN", how="inner")
print("After merging with HbA1c, shape:", df.shape)

# create diabetes label (0/1). Option: include prediabetes as 1 if you want
# Standard cutoffs:
#   normal < 5.7
#   prediabetes 5.7-6.4
#   diabetes >= 6.5
df["diabetes_binary"] = (df["HBA1C"] >= 6.5).astype(int)
# if LBXGH_G/H column names differ, adjust above logic

print("Final dataset for modeling: n =", len(df))
print("Class balance:\n", df["diabetes_binary"].value_counts())

# create gender label (0: male, 1: female)
df["gender"] = df["gender"].map({"Male": 0, "Female": 1})

After merging with HbA1c, shape: (11303, 20)
Final dataset for modeling: n = 11303
Class balance:
 diabetes_binary
0    10244
1     1059
Name: count, dtype: int64


In [9]:
# -----------------------------
# 8. Select model features & train/test split
# -----------------------------
# Choose features from aggregated step features + some demographics if present
# Try: mean_daily_steps, sd_daily_steps, pct_valid_days, Age, BMI, RIAGENDR (gender), RIDAGEYR (age)
# Check for column existence and create X accordingly:
print(df.columns.tolist())
feat_candidates = [
    "mean_daily_steps", "median_daily_steps", "sd_daily_steps", "pct_valid_days",
    # optional AC/MIMS
    "mean_daily_AC", "mean_daily_mims",
    # demographics (column names vary in subject-info.csv; check common ones)
    "gender", "age_in_years_at_screening", "full_sample_2_year_mec_exam_weight"
]
# pick present features
features = [f for f in feat_candidates if f in df.columns]
print("Using features:", features)

X = df[features].copy()
y = df["diabetes_binary"].astype(int)

# simple fillna (median) for any leftover missing values
X = X.fillna(X.median())

# stratified splits
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.40, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print("Split sizes (train/val/test):", len(X_train), len(X_val), len(X_test))

['SEQN', 'mean_daily_steps', 'median_daily_steps', 'sd_daily_steps', 'valid_day_count', 'total_days', 'pct_valid_days', 'mean_daily_AC', 'sd_daily_AC', 'mean_daily_mims', 'data_release_cycle', 'gender', 'age_in_years_at_screening', 'full_sample_2_year_interview_weight', 'full_sample_2_year_mec_exam_weight', 'masked_variance_pseudo_psu', 'masked_variance_pseudo_stratum', 'LBXGH_G', 'LBXGH_H', 'HBA1C', 'diabetes_binary']
Using features: ['mean_daily_steps', 'median_daily_steps', 'sd_daily_steps', 'pct_valid_days', 'mean_daily_AC', 'mean_daily_mims', 'gender', 'age_in_years_at_screening', 'full_sample_2_year_mec_exam_weight']
Split sizes (train/val/test): 6781 2261 2261


In [10]:
# -----------------------------
# 9. Train Random Forest and thresholding as before
# -----------------------------
model = RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# choose threshold by maximizing F1 on validation
y_val_prob = model.predict_proba(X_val)[:, 1]
best_t, best_f1 = 0.0, -1.0
for t in np.linspace(0.05, 0.95, 200):
    preds = (y_val_prob >= t).astype(int)
    score = f1_score(y_val, preds)
    if score > best_f1:
        best_f1 = score
        best_t = t
print("Best validation threshold:", best_t, "F1:", best_f1)

# lower threshold to boost recall
adjusted_t = best_t * 0.8
print("Adjusted (recall-boosted) threshold:", adjusted_t)

y_test_prob = model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_prob >= adjusted_t).astype(int)

print("Test Precision:", precision_score(y_test, y_test_pred))
print("Test Recall:", recall_score(y_test, y_test_pred))
print("Test F1:", f1_score(y_test, y_test_pred))
print("Test PR AUC:", average_precision_score(y_test, y_test_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

Best validation threshold: 0.18115577889447237 F1: 0.27722772277227725
Adjusted (recall-boosted) threshold: 0.1449246231155779
Test Precision: 0.21761658031088082
Test Recall: 0.5943396226415094
Test F1: 0.3185840707964602
Test PR AUC: 0.22100175445963807
Confusion Matrix:
 [[1596  453]
 [  86  126]]
