# NHANES import code

In [None]:
import requests
import tempfile
import os
import pyreadstat
import pandas as pd

#------------------------------------------------------
# 1. Loader that downloads any NHANES .xpt file safely
#------------------------------------------------------
def load_nhanes_xpt(file, year="2015", timeout=60):
    """
    Download one NHANES .xpt file and return its contents as a DataFrame.

    file:    file name on the server, e.g. 'DEMO_I.xpt'
    year:    directory of the survey cycle, e.g. '2015'
    timeout: seconds handed to requests.get (a single number or a
             (connect, read) tuple); without it a stalled connection
             would block the notebook indefinitely

    Returns the DataFrame produced by pyreadstat.read_xport; the metadata
    object it also returns is discarded.

    Raises whatever requests raises for connection problems / timeouts,
    requests.HTTPError for an error status, and ValueError when the body
    looks like an HTML page instead of XPT data.
    """
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{year}/DataFiles/{file}"

    # download with browser-like headers
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        )
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # detect HTML error page (a 200 response can still carry an HTML body)
    head = r.content[:200].lower()
    if b"<html" in head or b"<!doctype" in head:
        preview = r.content[:500].decode(errors="ignore")
        raise ValueError(f"HTML returned instead of XPT:\n{url}\n\nPreview:\n{preview}")

    # read_xport is given a path, so the bytes are spooled to a temp file first.
    # The file is created outside the try so that `tmp.name` is always defined
    # for the cleanup, and the cleanup runs even when the write itself fails.
    tmp = tempfile.NamedTemporaryFile(suffix=".xpt", delete=False)
    try:
        with tmp:
            tmp.write(r.content)
        # the handle is closed here, before pyreadstat reopens the file by path
        df, _meta = pyreadstat.read_xport(tmp.name)
    finally:
        os.remove(tmp.name)

    return df


#------------------------------------------------------
# 2. List all NHANES 2015 files you want to merge
#------------------------------------------------------
# Short component label -> file name on the server.
# Every file in this list follows the same '<label>_I.xpt' naming pattern,
# so the mapping is generated from the labels instead of being typed out.
nhanes_files = {
    component: f"{component}_I.xpt"
    for component in (
        "DEMO",
        "HDL",
        "TCHOL",
        "TRIGLY",
        "GLU",
        "INS",
        "DPQ",
        "SLQ",
        "DR1TOT",
        "DR1IFF",
        "PAQ",
        "BPX",
        "BIOPRO",
        "ALB_CR",
    )
}


#------------------------------------------------------
# 3. Download all files into a dictionary of DataFrames
#------------------------------------------------------
# label -> DataFrame, filled in the order the files are listed
all_dfs = {}

# One download per listed file; progress is printed before and after each
# request so a slow or failing file is easy to spot in the cell output.
for name, fname in nhanes_files.items():
    print(f"Downloading {name} ({fname})...")
    all_dfs[name] = df = load_nhanes_xpt(fname, year="2015")
    print(f"Loaded {name}: {df.shape} rows/columns")


#------------------------------------------------------
# 4. Merge all datasets on SEQN
#------------------------------------------------------
print("\nMerging all datasets...")

# Fold the frames together in dictionary order: the first frame seeds the
# result and each later frame is outer-joined onto it on the shared SEQN column.
# NOTE(review): an outer join repeats the left-hand rows once per matching
# right-hand row. If any listed file holds several rows per SEQN (the later
# cells group item-level columns by SEQN, which suggests one does), `merged`
# is no longer one row per SEQN — confirm before computing per-person stats.
merged = None
for name, df in all_dfs.items():
    merged = df if merged is None else merged.merge(df, on="SEQN", how="outer")

print("Final merged shape:", merged.shape)


#------------------------------------------------------
# 5. Preview merged data
#------------------------------------------------------
print("\nPreview of merged dataset:")
print(merged.head())


In [None]:
import numpy as np
import pandas as pd

# Make a copy so you can debug without overwriting the original:
# everything below adds or renames columns on `df`, while `merged` stays
# exactly as the import cell produced it and this cell can be re-run cleanly.
df = merged.copy()

# -----------------------------------------------------------
# 1. Rename common NHANES variables for clarity
# -----------------------------------------------------------

# Raw NHANES variable name -> readable column name.
# Only keys that exist in the frame are applied, so a misspelt key fails
# silently and every feature that depends on it is skipped. The 2015-2016
# files name insulin LBXIN, HDL LBDHDD and sleep hours SLD012; the previous
# spellings are kept as aliases so frames that do carry them still work.
# (A frame is expected to hold at most one spelling of each variable;
# two spellings at once would produce duplicate column names.)
rename_map = {
    "RIAGENDR": "sex",
    "RIDAGEYR": "age",
    "RIDRETH1": "race_ethnicity",
    "INDFMPIR": "poverty_income_ratio",
    "LBXGLU":   "fasting_glucose",
    "LBXIN":    "fasting_insulin",
    "LBXINS":   "fasting_insulin",     # alias kept for backward compatibility
    "LBXTC":    "total_cholesterol",
    "LBDHDD":   "hdl_cholesterol",
    "LBDHDL":   "hdl_cholesterol",     # alias kept for backward compatibility
    "LBXTR":    "triglycerides",
    "BPXSY1":   "systolic_1",
    "BPXDI1":   "diastolic_1",
    "SLD012":   "sleep_hours",
    "SLD010H":  "sleep_hours",         # alias kept for backward compatibility
}

# Keep only the mappings whose source column is actually present, then rename.
present_renames = {old: new for old, new in rename_map.items() if old in df.columns}
df = df.rename(columns=present_renames)

# -----------------------------------------------------------
# 2. Derived lipid values
# -----------------------------------------------------------

# Friedewald LDL (only valid TG < 400 mg/dL)
def friedewald_ldl(row):
    """
    Friedewald LDL estimate for one row: total - HDL - triglycerides / 5.

    row: mapping / Series with 'triglycerides', 'total_cholesterol' and
         'hdl_cholesterol'.
    Returns NaN when triglycerides are missing or at/above 400, where the
    formula is not applied.
    """
    triglycerides = row["triglycerides"]
    usable = not pd.isna(triglycerides) and triglycerides < 400
    if usable:
        return row["total_cholesterol"] - row["hdl_cholesterol"] - triglycerides / 5
    return np.nan

# LDL needs all three lipid inputs. The guard checks triglycerides too:
# without it a frame that has cholesterol but no triglycerides fails with a
# KeyError halfway through the calculation.
lipid_inputs = ["total_cholesterol", "hdl_cholesterol", "triglycerides"]
if all(name in df.columns for name in lipid_inputs):
    tg = df["triglycerides"]
    ldl = df["total_cholesterol"] - df["hdl_cholesterol"] - tg / 5
    # Same rule as friedewald_ldl, applied to whole columns at once instead of
    # row by row: `where` keeps the estimate only when TG < 400, which also
    # blanks rows whose TG is missing (NaN < 400 is False).
    df["ldl_cholesterol"] = ldl.where(tg < 400)

# ApoB is not among the renamed inputs, so it is estimated here:
# ApoB ≈ 0.65 * LDL + 0.1 * TG
# NOTE(review): the source of these coefficients is not cited in this file — confirm.
if "ldl_cholesterol" in df.columns and "triglycerides" in df.columns:
    df["apob_est"] = 0.65 * df["ldl_cholesterol"] + 0.1 * df["triglycerides"]

# -----------------------------------------------------------
# 3. HOMA-IR
# -----------------------------------------------------------
# Formula: (fasting insulin (µU/mL) × fasting glucose (mg/dL)) / 405

# Computed only when both fasting measurements are present in the frame.
homa_inputs = {"fasting_glucose", "fasting_insulin"}
if homa_inputs.issubset(df.columns):
    insulin_times_glucose = df["fasting_insulin"].mul(df["fasting_glucose"])
    df["homa_ir"] = insulin_times_glucose.div(405)

# -----------------------------------------------------------
# 4. Blood pressure aggregates
# -----------------------------------------------------------

# Mean of every available reading.
# Step 1 may already have renamed the first readings (BPXSY1 / BPXDI1) to
# systolic_1 / diastolic_1, so those names are matched as well; matching on
# the raw prefix alone would silently leave the first reading out of the mean.
sbp_cols = [c for c in df.columns if c.startswith("BPXSY") or c == "systolic_1"]
dbp_cols = [c for c in df.columns if c.startswith("BPXDI") or c == "diastolic_1"]

if sbp_cols:
    df["sbp"] = df[sbp_cols].mean(axis=1)
if dbp_cols:
    df["dbp"] = df[dbp_cols].mean(axis=1)

# Pulse pressure — only when both means exist, otherwise the subtraction
# would raise KeyError on a frame without blood-pressure columns
if "sbp" in df.columns and "dbp" in df.columns:
    df["pulse_pressure"] = df["sbp"] - df["dbp"]

# -----------------------------------------------------------
# 5. Kidney function (eGFR, CKD-EPI 2021 equation)
# -----------------------------------------------------------

# NHANES 2015 serum creatinine = LBXSCR (in BIOPRO_I)
if "LBXSCR" in df.columns:
    scr = df["LBXSCR"]
    sex = df["sex"]
    female = sex == 2  # sex is coded 1 = male, 2 = female

    # CKD-EPI 2021 (race-free):
    #   142 * min(Scr/k, 1)^alpha * max(Scr/k, 1)^-1.200 * 0.9938^age * 1.012 [if female]
    # k and alpha differ by sex; the trailing 1.012 multiplier applies to
    # women only and was missing from the earlier version of this block.
    k = np.where(female, 0.7, 0.9)
    alpha = np.where(female, -0.241, -0.302)
    female_factor = np.where(female, 1.012, 1.0)
    min_part = np.minimum(scr / k, 1) ** alpha
    max_part = np.maximum(scr / k, 1) ** -1.200
    df["egfr"] = 142 * min_part * max_part * (0.9938 ** df["age"]) * female_factor

# -----------------------------------------------------------
# 6. Sleep metrics (SLQ)
# -----------------------------------------------------------

# sleep_hours already renamed in step 1
if "sleep_hours" in df.columns:
    # Create a basic sleep quality score:
    # 7–9 hours = 100; 6–<7 or >9–10 hours = 80; anything shorter or longer = 50.
    def sleep_score(x):
        """Map nightly sleep hours to 100 / 80 / 50; NaN for missing or impossible values."""
        # A day has 24 hours, so anything outside 0–24 cannot be a duration.
        # Such values are questionnaire codes (NHANES uses 77 / 99 for
        # refused / don't know) and must not be scored as "long sleep".
        if pd.isna(x) or not 0 <= x <= 24:
            return np.nan
        if 7 <= x <= 9:
            return 100
        if 6 <= x < 7 or 9 < x <= 10:
            return 80
        return 50  # short or long sleep

    df["sleep_score"] = df["sleep_hours"].apply(sleep_score)

# -----------------------------------------------------------
# 7. Physical Activity
# -----------------------------------------------------------
# NHANES PAQ reports each activity domain as three items: a yes/no screening
# question, the number of days per week, and the minutes on a typical day.
# Weekly minutes for a domain are therefore days * minutes. PAD615 and PAD630
# are both minutes-per-day items (vigorous work and moderate work), so
# multiplying them together, as this block used to, does not give minutes/week.
pa_domains = [
    # (screening item, days/week, minutes/day, weight)
    ("PAQ605", "PAQ610", "PAD615", 2),  # vigorous work
    ("PAQ620", "PAQ625", "PAD630", 1),  # moderate work
    ("PAQ635", "PAQ640", "PAD645", 1),  # walking / bicycling for transport
    ("PAQ650", "PAQ655", "PAD660", 2),  # vigorous recreation
    ("PAQ665", "PAQ670", "PAD675", 1),  # moderate recreation
]

pa_parts = []
for gate_col, days_col, minutes_col, weight in pa_domains:
    if not {gate_col, days_col, minutes_col}.issubset(df.columns):
        continue
    # keep real answers only: refused / don't know codes (77, 99, 7777, 9999)
    # fall outside these ranges and become NaN
    days = df[days_col].where(df[days_col].between(1, 7))
    minutes = df[minutes_col].where(df[minutes_col].between(0, 1440))
    weekly = days * minutes
    # answering "no" (2) to the screening item skips the follow-ups, which
    # means zero minutes in that domain rather than a missing value
    weekly = weekly.mask(df[gate_col] == 2, 0)
    pa_parts.append(weight * weekly)  # double weight vigorous

if pa_parts:
    # min_count=1 leaves rows with no usable domain at all as NaN instead of 0
    df["mvpa_min_week"] = pd.concat(pa_parts, axis=1).sum(axis=1, min_count=1)

# -----------------------------------------------------------
# 8. PHQ-9 total
# -----------------------------------------------------------

phq_cols = [c for c in df.columns if c.startswith("DPQ0")]
if phq_cols:
    # Item answers are scored 0–3. Any other stored value (7 = refused,
    # 9 = don't know) is not a score and must not be added to the total.
    phq_items = df[phq_cols].where(df[phq_cols].isin([0, 1, 2, 3]))
    # skipna=False: the total is only defined when every item was answered.
    # The default sum would report 0 ("no symptoms") for people who never
    # took the questionnaire.
    df["phq9_total"] = phq_items.sum(axis=1, skipna=False)

# -----------------------------------------------------------
# 9. Diet scores (simplified)
# -----------------------------------------------------------

# DR1TOT_I.xpt contains:
# - DR1TKCAL = total kcal
# - DR1TTFAT = total fat
# - DR1TSODI = sodium
# - DR1TSUGR = sugars
# and more

# All three inputs are required; checking only DR1TKCAL would raise KeyError
# on a frame that lacks the sugar or sodium column.
diet_inputs = ["DR1TKCAL", "DR1TSUGR", "DR1TSODI"]
if all(name in df.columns for name in diet_inputs):
    # very simple diet quality: calories from sugar + sodium load
    kcal = df["DR1TKCAL"].where(df["DR1TKCAL"] > 0)      # 0 kcal would divide by zero
    # sugar is reported in grams; 4 kcal per gram turns it into energy so the
    # ratio really is "% of calories from sugar"
    sugar_pct_kcal = df["DR1TSUGR"] * 4 / kcal * 100
    sodium_pct_target = df["DR1TSODI"] / 2300 * 100      # sodium relative to daily target
    df["diet_penalty"] = sugar_pct_kcal + sodium_pct_target

# Placeholder for rPDQS scoring (we can write a full version if you want)
# df["rpdqs_score"] = ...

# -----------------------------------------------------------
# 10. Basic cleaning
# -----------------------------------------------------------
# Values treated as "missing" wherever they occur in the frame.
# NOTE(review): this runs after the derived features above were computed, so
# any of those features built from a row holding one of these codes already
# used the code as a real number — consider cleaning before deriving.
missing_codes = [7777, 9999, 999999]
df = df.replace(missing_codes, np.nan)

# Done.
df.head()


# build rPDQS score

In [None]:
import numpy as np
import pandas as pd

# -----------------------------------------------------------
# 1. Build NHANES food category map → rPDQS groups
# -----------------------------------------------------------

# NHANES FPED (Food Patterns) categories are encoded in DR1IFF:
# There is a variable called "DR1IFDCD" (food code)
# and "DR1IGRMS" (gram amount consumed)

# Below is a simplified mapping widely used in NHANES dietary research.
# For full accuracy we could also merge FPED databases — available if wanted.

# rPDQS food group -> list of category codes assigned to it.
# NOTE(review): the codes here are four digits long; confirm they are on the
# same scale as the food-code column they are meant to be matched against.
rpdqs_map = dict(
    # Positive groups
    dark_green_veg=[6310, 6320],
    other_veg=[6110, 6120, 6130, 6140],
    citrus_melons_berries=[6210, 6220, 6230],
    other_fruit=[6240, 6250],
    legumes=[7510],
    whole_grains=[5710, 5720],
    nuts_seeds=[7410, 7420],
    low_fat_dairy=[1310, 1320],
    fish=[2710, 2720],
    # Negative groups
    red_meat=[2510, 2520],
    processed_meat=[2530, 2540],
    refined_grains=[5610, 5620],
    ssb=[9310, 9320, 9330],  # sugar-sweetened beverages
    fried_foods=[6410, 6420],
)



In [None]:
# This is an alias, not a copy: df_rpdqs and df are the same object, so a
# column added through either name shows up under both.
df_rpdqs = df  # make df_rpdqs point to the engineered dataframe

# ML longevity score 2.1
Uses only variables that are available in NHANES.

In [None]:
import numpy as np
import pandas as pd

df = merged.copy()

# ---------------------------------------------------------
# 0. Basic demographics
# ---------------------------------------------------------
df["sex"] = df["RIAGENDR"]                          # 1=Male, 2=Female
df["race_ethnicity"] = df["RIDRETH3"]               # Used in Longevity Score v1.1

# Age in years at screening (RIDAGEYR).
# RIDEXAGM (age in months at exam) is only released for participants aged
# 0–19 in NHANES 2015–2016, so deriving age from it leaves every adult with a
# missing age — and with it a missing eGFR.
df["age"] = df["RIDAGEYR"]

# Poverty ratio = INDFMPIR (ratio of family income to poverty).
# INDFMIN2 is a coded family-income bracket, not a ratio.
df["poverty_income_ratio"] = df["INDFMPIR"]

# ---------------------------------------------------------
# 1. Rename biochemical variables
# ---------------------------------------------------------
# Raw NHANES variable name -> readable column name.
# Total cholesterol and triglycerides use the mg/dL variables (LBXTC, LBXTR).
# The *SI variants (LBDTCSI, LBDTRSI) are in mmol/L and cannot be combined
# with HDL in mg/dL (LBDHDD), the TG/5 term or the 400 cut-off used by the
# Friedewald step below.
rename_map = {
    "LBXGLU": "fasting_glucose",
    "LBXIN": "fasting_insulin",
    "LBDHDD": "hdl_cholesterol",
    "LBXTC": "total_cholesterol",
    "LBXTR": "triglycerides",
    "LBXSCR": "serum_creatinine"
}

# Keep only the mappings whose source column is actually present, then rename.
present_renames = {old: new for old, new in rename_map.items() if old in df.columns}
df = df.rename(columns=present_renames)

# ---------------------------------------------------------
# 2. LDL (Friedewald) + ApoB
# ---------------------------------------------------------
def friedewald(row):
    """
    Friedewald LDL estimate for one row: total - HDL - triglycerides / 5.

    row: mapping / Series with 'triglycerides', 'total_cholesterol' and
         'hdl_cholesterol'.
    Returns NaN when triglycerides are missing or at/above 400.
    """
    triglycerides = row["triglycerides"]
    usable = not pd.isna(triglycerides) and triglycerides < 400
    if usable:
        return row["total_cholesterol"] - row["hdl_cholesterol"] - triglycerides / 5
    return np.nan

# Same rule as friedewald(), applied to whole columns at once. Calling the
# function through df.apply(axis=1) builds a Series for every row of the
# merged frame, which is needlessly slow; the arithmetic below gives the same
# values. `where` keeps the estimate only when TG < 400 and therefore also
# blanks rows whose TG is missing (NaN < 400 is False).
tg = df["triglycerides"]
ldl = df["total_cholesterol"] - df["hdl_cholesterol"] - tg / 5
df["ldl_cholesterol"] = ldl.where(tg < 400)
df["apob_est"] = 0.65 * df["ldl_cholesterol"] + 0.1 * df["triglycerides"]

# ---------------------------------------------------------
# 3. HOMA-IR
# ---------------------------------------------------------
df["homa_ir"] = (df["fasting_insulin"] * df["fasting_glucose"]) / 405

# ---------------------------------------------------------
# 4. Sleep (SLD012 = sleep hours in 2015–2016; SLD010H in earlier cycles)
# ---------------------------------------------------------
# SLQ030 is the "how often do you snore" item in this cycle, not a duration;
# reading it as hours yields a handful of frequency codes that happen to fall
# inside the 3–14 window below. The duration column is looked up by name and
# the cell stops with a clear message when neither candidate is present.
sleep_var = next((c for c in ("SLD012", "SLD010H") if c in df.columns), None)
if sleep_var is None:
    raise KeyError("No sleep-duration column (SLD012 / SLD010H) found in the merged data")

df["sleep_hours"] = df[sleep_var]

# keep only values between 3 and 14 hours; everything else becomes NaN
df["sleep_hours_clean"] = df[sleep_var].where(
    (df[sleep_var] >= 3) & (df[sleep_var] <= 14),
    np.nan
)

def compute_sleep_score(x):
    """
    Score nightly sleep hours: 100 for 7–9 h, 80 for 6–<7 h or >9–10 h,
    50 for anything shorter or longer, NaN when the input is missing.
    """
    if pd.isna(x):
        return np.nan
    # outside the 6–10 hour band: short or long sleep
    if x < 6 or x > 10:
        return 50
    # inside the band: the 7–9 core scores full marks, the edges score 80
    return 100 if 7 <= x <= 9 else 80

# score the cleaned hours element by element
cleaned_hours = df["sleep_hours_clean"]
df["sleep_score"] = cleaned_hours.map(compute_sleep_score)

# ---------------------------------------------------------
# 5. PHQ-9
# ---------------------------------------------------------
phq_cols = [col for col in df.columns if col.startswith("DPQ0")]
# Item answers are scored 0–3. Any other stored value (7 = refused,
# 9 = don't know) is not a score and must not be added to the total.
phq_items = df[phq_cols].where(df[phq_cols].isin([0, 1, 2, 3]))
# skipna=False: the total is only defined when every item was answered.
# The default sum would report 0 ("no symptoms") for people who never took
# the questionnaire.
df["phq9_total"] = phq_items.sum(axis=1, skipna=False)

# ---------------------------------------------------------
# 6. Physical activity (leisure MSPA)
# ---------------------------------------------------------
# NHANES PAQ reports each activity domain as three items: a yes/no screening
# question, the number of days per week, and the minutes on a typical day.
# Weekly minutes are days * minutes. PAD615/PAD630/PAD645/PAD660 are all
# minutes-per-day items from four different domains, so multiplying them
# pairwise, as this block used to, does not give minutes per week.
# Only the recreational (leisure-time) domains are used here.
leisure_domains = [
    # (screening item, days/week, minutes/day, weight)
    ("PAQ650", "PAQ655", "PAD660", 2),  # vigorous recreation, double weight
    ("PAQ665", "PAQ670", "PAD675", 1),  # moderate recreation
]

leisure_parts = []
for gate_col, days_col, minutes_col, weight in leisure_domains:
    if not {gate_col, days_col, minutes_col}.issubset(df.columns):
        continue
    # keep real answers only: refused / don't know codes (77, 99, 7777, 9999)
    # fall outside these ranges and become NaN
    days = df[days_col].where(df[days_col].between(1, 7))
    minutes = df[minutes_col].where(df[minutes_col].between(0, 1440))
    weekly = days * minutes
    # answering "no" (2) to the screening item skips the follow-ups, which
    # means zero minutes in that domain rather than a missing value
    weekly = weekly.mask(df[gate_col] == 2, 0)
    leisure_parts.append(weight * weekly)

if leisure_parts:
    # min_count=1 leaves rows with no usable domain at all as NaN instead of 0
    df["mvpa_min_week"] = pd.concat(leisure_parts, axis=1).sum(axis=1, min_count=1)

# ---------------------------------------------------------
# 7. eGFR — CKD-EPI 2021
# ---------------------------------------------------------
# CKD-EPI 2021 (race-free):
#   142 * min(Scr/k, 1)^alpha * max(Scr/k, 1)^-1.200 * 0.9938^age * 1.012 [if female]
scr = df["serum_creatinine"]
female = df["sex"] == 2  # 1=Male, 2=Female
k = np.where(female, 0.7, 0.9)
alpha = np.where(female, -0.241, -0.302)
# the equation's final multiplier applies to women only; it was missing from
# the earlier version of this block
female_factor = np.where(female, 1.012, 1.0)

min_val = (scr / k).clip(upper=1) ** alpha
max_val = (scr / k).clip(lower=1) ** -1.200
df["egfr"] = 142 * min_val * max_val * (0.9938 ** df["age"]) * female_factor

# ---------------------------------------------------------
# 8. Minimal rPDQS — warn: DR1IFDCD does NOT map to rPDQS correctly
# ---------------------------------------------------------
# Gram amount divided by 100, i.e. every 100 g counts as one "serving".
df["servings"] = df["DR1IGRMS"] / 100

def classify_food(code):
    """Placeholder classifier: ignores `code` and returns None for every food."""
    # placeholder; real rPDQS requires FNDDS → food groups mapping
    return None

# Because classify_food always returns None, rpdqs_cat is null on every row.
df["rpdqs_cat"] = df["DR1IFDCD"].apply(classify_food)

# Sum servings per (SEQN, category) and pivot the categories into columns.
# NOTE(review): with every category null this table presumably comes out with
# no rows and no category columns (groupby drops null keys by default), which
# would make the merge below add nothing — confirm on the pandas version in use.
serving_table = df.groupby(["SEQN","rpdqs_cat"])["servings"].sum().unstack().fillna(0)
# Left join keeps every existing row of df and attaches the per-SEQN totals.
df = df.merge(serving_table, on="SEQN", how="left")

# scoring for placeholder categories
def _quintile_codes(s):
    """Return 0–4 quantile scores for `s`, rescaled when fewer than five bins survive."""
    # labels=False returns plain bin numbers, so dropping duplicate edges
    # (heavily tied data, e.g. many zeros) can no longer clash with a fixed
    # list of five labels, which made qcut raise ValueError.
    codes, edges = pd.qcut(s, 5, labels=False, retbins=True, duplicates="drop")
    n_bins = len(edges) - 1
    codes = codes.astype(float)
    if n_bins < 2:
        # a single bin cannot rank anything: no score
        return codes * np.nan
    # stretch the surviving bins over the same 0–4 range; with all five bins
    # present the factor is exactly 1
    return codes * (4 / (n_bins - 1))


def pos_score(s):
    """Quantile score where larger values of `s` score higher (0 lowest, 4 highest)."""
    return _quintile_codes(s)


def neg_score(s):
    """Quantile score where larger values of `s` score lower (4 lowest intake, 0 highest)."""
    return 4 - _quintile_codes(s)

# no-op because categories are empty: both score columns are constant zero
df = df.assign(rpdqs_total=0, rpdqs_normalized=0)

# ---------------------------------------------------------
# Final dataset
# ---------------------------------------------------------
# independent copy, so later edits to df do not leak into df_final
df_final = df.copy()
print("FINAL SHAPE:", df_final.shape)
df_final.head()


In [None]:
# NOTE(review): `fped` is not defined in any cell visible here — presumably a
# DataFrame holding the DR1T_* columns used below, loaded elsewhere; confirm
# it exists before running this cell. A copy is taken so the columns added
# below do not modify `fped` itself.
fp = fped.copy()

# Helper: safe getter (returns 0 if column not present)
def col(df, name):
    """Return column `name` of `df`, or the scalar 0 when `df` has no such column."""
    if name not in df.columns:
        return 0
    return df[name]

# ---------------------------------------------------------
# Build rPDQS components directly from FPED servings
# (NO kcal normalization needed)
# ---------------------------------------------------------

# Component name -> source columns that are summed into it.
# Components scored so that a higher intake earns more points:
healthy_groups = dict(
    fruit=["DR1T_F_TOTAL"],
    vegetables=["DR1T_V_TOTAL"],
    whole_grains=["DR1T_G_WHOLE"],
    nuts_seeds=["DR1T_PF_NUTSDS"],
    legumes=["DR1T_PF_LEGUMES"],
    fish=["DR1T_PF_SEAFD_HI", "DR1T_PF_SEAFD_LOW"],
    lowfat_dairy=["DR1T_D_MILK", "DR1T_D_YOGURT"],
)

# Components scored so that a higher intake earns fewer points:
# NOTE(review): confirm that DR1T_A_DRINKS really measures sugar-sweetened
# beverages — the "A_" prefix may denote alcoholic drinks instead.
unhealthy_groups = dict(
    red_meat=["DR1T_PF_MEAT"],
    processed_meat=["DR1T_PF_CUREDMEAT"],
    refined_grains=["DR1T_G_REFINED"],
    ssb=["DR1T_A_DRINKS"],
    fried_foods=["DR1T_V_STARCHY_POTATO"],  # NHANES proxy
)

# Collapse groups: each component column is the row-wise sum of its source
# columns, healthy components first and unhealthy ones after.
# NOTE(review): a source column missing from `fp` raises KeyError here; the
# `col` helper defined above is not used by this loop.
for groups in (healthy_groups, unhealthy_groups):
    for name, cols in groups.items():
        fp[name] = fp[cols].sum(axis=1)

# ---------------------------------------------------------
# Safe quantile scoring (your working functions)
# ---------------------------------------------------------

def score_positive(series):
    """
    Score a column from 0 to 4 by quantile of its percentile rank
    (higher value -> higher score).

    Tries 5, 4, 3 and then 2 quantile bins and uses the first split that
    qcut accepts, rescaling the bin number so the top bin always scores 4.
    Returns an all-NaN Series (same index) when the column has fewer than
    two distinct values or when no split works.
    """
    if series.nunique(dropna=True) < 2:
        return pd.Series(np.nan, index=series.index)

    ranks = series.rank(method="average", pct=True)

    for bins in (5, 4, 3, 2):
        try:
            out = pd.qcut(ranks, bins, labels=list(range(bins)), duplicates="drop")
        except ValueError:
            # qcut rejects the split (tied edges leave fewer bins than labels):
            # fall back to a coarser one. Only ValueError is caught, so a typo,
            # an interrupt or any other failure is no longer silently swallowed.
            continue
        return out.astype(float) * (4 / (bins - 1))

    return pd.Series(np.nan, index=series.index)

def score_negative(series):
    """
    Reverse of score_positive: a higher value earns a lower score (4 - score).

    When score_positive could not score the column at all (every entry NaN),
    that all-NaN result is returned unchanged.
    """
    scored = score_positive(series)
    if scored.isna().all():
        return scored
    return 4 - scored

# Apply scoring: healthy components are scored upward, unhealthy ones downward;
# each result lands in a column named rpdqs_<component>.
scorers = [(healthy_groups, score_positive), (unhealthy_groups, score_negative)]
for groups, scorer in scorers:
    for g in groups:
        fp[f"rpdqs_{g}"] = scorer(fp[g])

# Final score: row-wise sum of every column whose name starts with "rpdqs_"
score_cols = [c for c in fp.columns if c.startswith("rpdqs_")]
rpdqs_sum = fp[score_cols].sum(axis=1)
fp["rpdqs_total"] = rpdqs_sum
# NOTE(review): the groups above define 12 components of at most 4 points
# each (48 in total), while the divisor is 52 — confirm which maximum is meant.
fp["rpdqs_normalized"] = rpdqs_sum / 52 * 100


In [None]:
# List the component columns that went into the total, then show their
# first rows as a quick sanity check.
print(score_cols)
fp[score_cols].head()
