In [5]:
import pandas as pd

def load_lmf(path_to_dat):
    colspecs = [
        (0, 6),     # SEQN (key)
        (6, 7),     # Eligible
        (7, 8),     # Mortality status
        (8, 13),    # Follow-up time (months)
        (13, 16),   # Underlying cause of death (ICD10)
        (16, 17),   # Heart disease death flag
        (17, 18),   # Cancer death flag
        (18, 19),   # Chronic lower resp death flag
        (19, 20),   # Cerebrovascular death flag
        (20, 21),   # Diabetes death flag
    ]
    colnames = [
        "SEQN",
        "eligible",
        "mortstat",
        "permth_exm",
        "ucod_icd10",
        "heart_dth",
        "cancer_dth",
        "clrd_dth",
        "stroke_dth",
        "diabetes_dth"
    ]

    df = pd.read_fwf(path_to_dat, colspecs=colspecs, names=colnames)
    df["SEQN"] = df["SEQN"].astype(int)
    return df


In [6]:
lmf = load_lmf("/Users/johnmcdonnell/Desktop/git repositories/quotient health NHANES/NHANES_2017_2018_MORT_2019_PUBLIC.dat")
print(lmf.head())

    SEQN  eligible  mortstat  permth_exm  ucod_icd10  heart_dth  cancer_dth  \
0  93703       NaN       NaN         NaN         2.0        NaN         NaN   
1  93704       NaN       NaN         NaN         2.0        NaN         NaN   
2  93705       NaN       NaN         NaN        10.0        NaN         NaN   
3  93706       NaN       NaN         NaN        10.0        NaN         NaN   
4  93707       NaN       NaN         NaN         2.0        NaN         NaN   

   clrd_dth stroke_dth diabetes_dth  
0       NaN          .            .  
1       NaN          .            .  
2       NaN          .            .  
3       NaN          .            .  
4       NaN          .            .  


In [8]:
import numpy as np
import pandas as pd

# ------------------------------------------------------------
# CLEAN AND TIDY PUBLIC-USE LMF (NHANES LINKED MORTALITY FILE)
# ------------------------------------------------------------

lmf_clean = lmf.copy()

# 1. Convert Stata missing "." to NaN (pandas already does most of this)
lmf_clean = lmf_clean.replace({".": np.nan})

# 2. mortstat: 1 = deceased, 0 = alive
lmf_clean["mortstat"] = pd.to_numeric(lmf_clean["mortstat"], errors="coerce")

# 3. Cause-of-death flags â†’ 0/1 integers
cod_cols = ["heart_dth", "cancer_dth", "clrd_dth", "stroke_dth", "diabetes_dth"]

for col in cod_cols:
    if col in lmf_clean.columns:
        lmf_clean[col] = pd.to_numeric(lmf_clean[col], errors="coerce")
        lmf_clean[col] = lmf_clean[col].fillna(0).astype(int)

# 4. permth_exm: months from exam to death or censor date
lmf_clean["permth_exm"] = pd.to_numeric(lmf_clean["permth_exm"], errors="coerce")

# 5. Create simple binary mortality outcome (for modeling)
lmf_clean["death"] = lmf_clean["mortstat"].fillna(0).astype(int)

# 6. Optional: rename columns for readability
lmf_clean = lmf_clean.rename(columns={
    "permth_exm": "months_followup",
    "mortstat": "mortality_status",
    "ucod_icd10": "ucod_category"
})

lmf_clean.head()



Unnamed: 0,SEQN,eligible,mortality_status,months_followup,ucod_category,heart_dth,cancer_dth,clrd_dth,stroke_dth,diabetes_dth,death
0,93703,,,,2.0,0,0,0,0,0,0
1,93704,,,,2.0,0,0,0,0,0,0
2,93705,,,,10.0,0,0,0,0,0,0
3,93706,,,,10.0,0,0,0,0,0,0
4,93707,,,,2.0,0,0,0,0,0,0


In [9]:
lmf_clean.to_csv("nhanes_lmf_clean.csv", index=False)