In [79]:
import pandas as pd
import numpy as np
from pathlib import Path

In [80]:
#Enforce copy on write
pd.options.mode.copy_on_write = True

Data far too large to upload - download from https://www.cdc.gov/brfss/annual_data/annual_2014.html (SAS format)


Ensure file name is LLCP2014.XPT (i had issues with file saving with extra space at end)

In [81]:
cwd = Path().cwd()
project_folder = cwd.parent
data_path = Path("data/LLCP2014.XPT")
file = project_folder / data_path

if not file.exists():
    raise FileNotFoundError("Data files not found. Please ensure the data files are in the correct directory.")

df = pd.read_sas(file)

print(df.head())


   _STATE  FMONTH        IDATE IMONTH   IDAY    IYEAR  DISPCODE         SEQNO  \
0     1.0     1.0  b'01172014'  b'01'  b'17'  b'2014'    1100.0  2.014000e+09   
1     1.0     1.0  b'01072014'  b'01'  b'07'  b'2014'    1100.0  2.014000e+09   
2     1.0     1.0  b'01092014'  b'01'  b'09'  b'2014'    1100.0  2.014000e+09   
3     1.0     1.0  b'01072014'  b'01'  b'07'  b'2014'    1100.0  2.014000e+09   
4     1.0     1.0  b'01162014'  b'01'  b'16'  b'2014'    1100.0  2.014000e+09   

           _PSU  CTELENUM  ...  _FOBTFS  _CRCREC  _AIDTST3  _IMPEDUC  \
0  2.014000e+09       1.0  ...      2.0      1.0       2.0       5.0   
1  2.014000e+09       1.0  ...      2.0      2.0       2.0       4.0   
2  2.014000e+09       1.0  ...      2.0      2.0       2.0       6.0   
3  2.014000e+09       1.0  ...      2.0      1.0       2.0       6.0   
4  2.014000e+09       1.0  ...      2.0      1.0       2.0       5.0   

   _IMPMRTL  _IMPHOME  RCSBRAC1  RCSRACE1  RCHISLA1  RCSBIRTH  
0       1.0     

Below I define variables deemed relevant to the data analysis. 

In [82]:
# --- Minimal 2014 BRFSS variable groups ---

# Sleep exposure
sleep_vars_2014 = [
    "SLEPTIM1",       # hours of sleep in a 24-hour period
]

# Core demographics (sex, age, SES, race/ethnicity, geography)
demo_vars_2014 = [
    "SEX",            # sex
    "_AGEG5YR",       # 5-year age groups
    "EDUCA",          # education
    "INCOME2",        # household income categories
    "_RACEGR3",       # race groups
    "_HISPANC",       # hispanic ethnicity
    "_STATE",         # state FIPS
]

# Body size 
bmi_vars_2014 = [
    "_BMI5",          # BMI *100
    "_BMI5CAT",       # BMI categories
]

# Physical activity (simple indicator of any leisure-time PA)
activity_vars_2014 = [
    "_TOTINDA",       # any vs no leisure-time physical activity
]

# Smoking & alcohol: compact, derived indicators plus binge behavior
smoking_alcohol_vars_2014 = [
    "_SMOKER3",       # 4-level smoking status
    "_RFSMOK3",       # current smoker risk indicator
    "DRNK3GE5",       # binge drinking (yes/no)
    "_RFBING5",       # binge drinking flag (derived)
]

# General health / mental health
health_status_vars_2014 = [
    "GENHLTH",        # self-rated health
    "MENTHLTH",       # days mental health not good (0–30)
]

# Major chronic conditions (keep a small, high-yield set)
chronic_condition_vars_2014 = [
    "CVDINFR4",       # ever heart attack
    "CVDSTRK3",       # ever stroke
    "CHCCOPD1",       # COPD/emphysema/chronic bronchitis
    "ASTHMA3",        # ever asthma
    "DIABETE3",       # diabetes
    "ADDEPEV2",       # depressive disorder
]

# Combine into a candidate list and subset dataframe
candidate_cols_2014 = (
    sleep_vars_2014
    + demo_vars_2014
    + bmi_vars_2014
    + activity_vars_2014
    + smoking_alcohol_vars_2014
    + health_status_vars_2014
    + chronic_condition_vars_2014
)

# Keep only those that actually exist in loaded 2014 dataframe
relevant_cols_2014 = [c for c in candidate_cols_2014 if c in df.columns]

print("Missing from 2014 df:", [c for c in candidate_cols_2014 if c not in df.columns])

df_reduced_2014 = df[relevant_cols_2014].copy()

df_reduced_2014.shape

Missing from 2014 df: []


(464664, 23)

Data Cleaning

In [83]:
# SLEPTIM1: valid 1–24; 77 = don't know, 99 = refused
if "SLEPTIM1" in df_reduced_2014.columns:
    df_reduced_2014["sleep_hours"] = (
        df_reduced_2014["SLEPTIM1"]
        .replace({77: np.nan, 99: np.nan})
        .astype("float")
    )

    # remove impossible values
    df_reduced_2014.loc[(df_reduced_2014["sleep_hours"] < 1) | (df_reduced_2014["sleep_hours"] > 24), "sleep_hours"] = np.nan

    # Short sleep indicator (< 7 hours)
    df_reduced_2014["short_sleep"] = np.where(df_reduced_2014["sleep_hours"] < 7, 1, 0)
    df_reduced_2014.loc[df_reduced_2014["sleep_hours"].isna(), "short_sleep"] = np.nan


In [84]:
# BMI from _BMI5 (BMI*100, 9999 = missing)
if "_BMI5" in df_reduced_2014.columns:
    df_reduced_2014["bmi"] = df_reduced_2014["_BMI5"].astype("float")
    df_reduced_2014.loc[df_reduced_2014["bmi"] >= 9999, "bmi"] = np.nan
    df_reduced_2014["bmi"] = df_reduced_2014["bmi"] / 100.0

    # Obesity indicator (BMI >= 30)
    df_reduced_2014["obese"] = np.where(df_reduced_2014["bmi"] >= 30, 1, 0)
    df_reduced_2014.loc[df_reduced_2014["bmi"].isna(), "obese"] = np.nan


In [85]:
# Sex: 1 = male, 2 = female, 7/9 = missing
if "SEX" in df_reduced_2014.columns:
    sex_clean = df_reduced_2014["SEX"].replace({7: np.nan, 9: np.nan})
    df_reduced_2014["sex_male"] = np.where(sex_clean == 1, 1,
                       np.where(sex_clean == 2, 0, np.nan))

# Education (EDUCA: 1–6 valid, 9 = DK/refused)
if "EDUCA" in df_reduced_2014.columns:
    edu_clean = df_reduced_2014["EDUCA"].replace({9: np.nan})
    # 0 = ≤HS, 1 = some college, 2 = college+
    df_reduced_2014["educ_3cat"] = np.select(
        [
            edu_clean.isin([1, 2, 3]),   # Never attended – HS grad
            edu_clean.isin([4, 5]),      # Some college/technical school
            edu_clean == 6,              # College grad+
        ],
        [0, 1, 2],
        default=np.nan
    )

# Income (INCOME2: 1–8, 77/99 = missing)
if "INCOME2" in df_reduced_2014.columns:
    inc_clean = df_reduced_2014["INCOME2"].replace({77: np.nan, 99: np.nan})
    # Low income (<$35k: 1–4), high income (5–8)
    df_reduced_2014["low_income"] = np.where(inc_clean.isin([1, 2, 3, 4]), 1,
                         np.where(inc_clean.isin([5, 6, 7, 8]), 0, np.nan))

# Race/ethnicity: _RACEGR3 (1=White NH, 2=Black NH, 3=Other NH, 4=Multiracial NH, 5=Hispanic)
if "_RACEGR3" in df_reduced_2014.columns:
    race_clean = df_reduced_2014["_RACEGR3"].replace({9: np.nan})
    df_reduced_2014["race_white_nh"] = np.where(race_clean == 1, 1,
                            np.where(race_clean.isin([2, 3, 4, 5]), 0, np.nan))
    df_reduced_2014["race_hispanic"] = np.where(race_clean == 5, 1,
                            np.where(race_clean.isin([1, 2, 3, 4]), 0, np.nan))


In [86]:
# _TOTINDA: 1 = any leisure-time PA, 2 = none, 9 = DK/refused
if "_TOTINDA" in df_reduced_2014.columns:
    pa_clean = df_reduced_2014["_TOTINDA"].replace({9: np.nan})
    df_reduced_2014["any_leisure_pa"] = np.where(pa_clean == 1, 1,
                             np.where(pa_clean == 2, 0, np.nan))


In [87]:
# _SMOKER3: 1=current every day, 2=current some days, 3=former, 4=never, 9=DK/ref
if "_SMOKER3" in df_reduced_2014.columns:
    sm_clean = df_reduced_2014["_SMOKER3"].replace({9: np.nan})
    df_reduced_2014["smoker_current"] = np.where(sm_clean.isin([1, 2]), 1,
                             np.where(sm_clean.isin([3, 4]), 0, np.nan))
    df_reduced_2014["smoker_ever"] = np.where(sm_clean.isin([1, 2, 3]), 1,
                          np.where(sm_clean == 4, 0, np.nan))

# _RFBING5: 1 = no binge, 2 = binge, 9 = DK/ref
if "_RFBING5" in df_reduced_2014.columns:
    binge_clean = df_reduced_2014["_RFBING5"].replace({9: np.nan})
    df_reduced_2014["binge_drink"] = np.where(binge_clean == 2, 1,
                          np.where(binge_clean == 1, 0, np.nan))


In [88]:
# GENHLTH: 1=excellent ... 5=poor, 7/9 = DK/ref
if "GENHLTH" in df_reduced_2014.columns:
    gh_clean = df_reduced_2014["GENHLTH"].replace({7: np.nan, 9: np.nan})
    df_reduced_2014["fairpoor_health"] = np.where(gh_clean.isin([4, 5]), 1,
                              np.where(gh_clean.isin([1, 2, 3]), 0, np.nan))

# MENTHLTH: 0–30, 88 = none, 77/99 = DK/ref
if "MENTHLTH" in df_reduced_2014.columns:
    mental = df_reduced_2014["MENTHLTH"].replace({77: np.nan, 99: np.nan, 88: 0}).astype("float")
    df_reduced_2014["mental_unhealthy_days"] = mental
    df_reduced_2014["frequent_mental_distress"] = np.where(mental >= 14, 1,
                                       np.where(mental < 14, 0, np.nan))


In [89]:
# Binary ever-diagnosed variables: 1=yes, 2=no, 7/9=missing
for var in ["CVDINFR4", "CVDSTRK3", "CHCCOPD1", "ASTHMA3", "ADDEPEV2"]:
    if var in df_reduced_2014.columns:
        x = df_reduced_2014[var].replace({7: np.nan, 9: np.nan})
        new_name = var.lower() + "_ever"
        df_reduced_2014[new_name] = np.where(x == 1, 1,
                         np.where(x == 2, 0, np.nan))

# DIABETE3: 1=yes, 2=yes (pregnant), 3=no, 4=pre-diabetes; 7/9=missing
if "DIABETE3" in df_reduced_2014.columns:
    dia = df_reduced_2014["DIABETE3"].replace({7: np.nan, 9: np.nan})
    df_reduced_2014["diabetes_any"] = np.where(dia.isin([1, 2]), 1,
                           np.where(dia.isin([3, 4]), 0, np.nan))
    df_reduced_2014["prediabetes"] = np.where(dia == 4, 1,
                          np.where(dia.isin([1, 2, 3]), 0, np.nan))


In [90]:
# Keep only respondents with valid sleep_hours
if "sleep_hours" in df_reduced_2014.columns:
    df_reduced_2014 = df_reduced_2014[df_reduced_2014["sleep_hours"].notna()].copy()

df_reduced_2014.shape

(458172, 46)

In [91]:
print(df_reduced_2014.columns)

Index(['SLEPTIM1', 'SEX', '_AGEG5YR', 'EDUCA', 'INCOME2', '_RACEGR3',
       '_HISPANC', '_STATE', '_BMI5', '_BMI5CAT', '_TOTINDA', '_SMOKER3',
       '_RFSMOK3', 'DRNK3GE5', '_RFBING5', 'GENHLTH', 'MENTHLTH', 'CVDINFR4',
       'CVDSTRK3', 'CHCCOPD1', 'ASTHMA3', 'DIABETE3', 'ADDEPEV2',
       'sleep_hours', 'short_sleep', 'bmi', 'obese', 'sex_male', 'educ_3cat',
       'low_income', 'race_white_nh', 'race_hispanic', 'any_leisure_pa',
       'smoker_current', 'smoker_ever', 'binge_drink', 'fairpoor_health',
       'mental_unhealthy_days', 'frequent_mental_distress', 'cvdinfr4_ever',
       'cvdstrk3_ever', 'chccopd1_ever', 'asthma3_ever', 'addepev2_ever',
       'diabetes_any', 'prediabetes'],
      dtype='object')


In [95]:
final_columns = [
    # Sleep
    "sleep_hours",
    "short_sleep",

    # Demographics
    "sex_male",
    "_AGEG5YR",
    "educ_3cat",
    "low_income",
    "race_white_nh",
    "race_hispanic",
    
    # Health behaviors
    "any_leisure_pa",
    "smoker_current",
    "binge_drink",

    # BMI
    "bmi",
    "obese",

    # Health status
    "fairpoor_health",
    "frequent_mental_distress",

    # Chronic conditions
    "cvdinfr4_ever",
    "cvdstrk3_ever",
    "chccopd1_ever",
    "asthma3_ever",
    "addepev2_ever",
    "diabetes_any",
]

df_clean = df_reduced_2014[final_columns].copy()
print(df_clean.head())

df_clean.shape

   sleep_hours  short_sleep  sex_male  _AGEG5YR  educ_3cat  low_income  \
0          9.0          0.0       0.0       9.0        1.0         0.0   
1          6.0          1.0       1.0      11.0        1.0         1.0   
2          8.0          0.0       1.0       7.0        2.0         0.0   
3          8.0          0.0       0.0      10.0        2.0         0.0   
4          8.0          0.0       0.0      10.0        1.0         1.0   

   race_white_nh  race_hispanic  any_leisure_pa  smoker_current  ...    bmi  \
0            0.0            0.0             0.0             0.0  ...  25.51   
1            1.0            0.0             1.0             0.0  ...  24.95   
2            1.0            0.0             1.0             0.0  ...  37.30   
3            1.0            0.0             0.0             0.0  ...  54.42   
4            1.0            0.0             0.0             0.0  ...  37.12   

   obese  fairpoor_health  frequent_mental_distress  cvdinfr4_ever  \
0    0.0  

(458172, 21)

Data is clean (i think) and ready to explore!