In [428]:
# Imports

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [429]:
#Enforce copy on write
pd.options.mode.copy_on_write = True

Data far too large to upload - download from https://www.cdc.gov/brfss/annual_data/annual_2014.html (SAS format)


Ensure file name is LLCP2014.XPT (i had issues with file saving with extra space at end)

In [430]:
cwd = Path().cwd()
project_folder = cwd.parent
data_path = Path("data/LLCP2014.XPT")
file = project_folder / data_path

if not file.exists():
    raise FileNotFoundError("Data files not found. Please ensure the data files are in the correct directory.")

df = pd.read_sas(file)

print(df.head())

df.shape


   _STATE  FMONTH        IDATE IMONTH   IDAY    IYEAR  DISPCODE         SEQNO  \
0     1.0     1.0  b'01172014'  b'01'  b'17'  b'2014'    1100.0  2.014000e+09   
1     1.0     1.0  b'01072014'  b'01'  b'07'  b'2014'    1100.0  2.014000e+09   
2     1.0     1.0  b'01092014'  b'01'  b'09'  b'2014'    1100.0  2.014000e+09   
3     1.0     1.0  b'01072014'  b'01'  b'07'  b'2014'    1100.0  2.014000e+09   
4     1.0     1.0  b'01162014'  b'01'  b'16'  b'2014'    1100.0  2.014000e+09   

           _PSU  CTELENUM  ...  _FOBTFS  _CRCREC  _AIDTST3  _IMPEDUC  \
0  2.014000e+09       1.0  ...      2.0      1.0       2.0       5.0   
1  2.014000e+09       1.0  ...      2.0      2.0       2.0       4.0   
2  2.014000e+09       1.0  ...      2.0      2.0       2.0       6.0   
3  2.014000e+09       1.0  ...      2.0      1.0       2.0       6.0   
4  2.014000e+09       1.0  ...      2.0      1.0       2.0       5.0   

   _IMPMRTL  _IMPHOME  RCSBRAC1  RCSRACE1  RCHISLA1  RCSBIRTH  
0       1.0     

(464664, 279)

In [None]:
cwd = Path().cwd()
project_folder = cwd.parent
data_path = Path("data/LLCP2014.XPT")
file = project_folder / data_path

if not file.exists():
    raise FileNotFoundError("Data files not found. Please ensure the data files are in the correct directory.")

df = pd.read_sas(file)

print(df.head())

df.shape


Below I define variables deemed relevant to the data analysis. 

In [431]:
# --- Minimal 2014 BRFSS variable groups ---

# Sleep exposure
sleep_vars_2014 = [
    "SLEPTIM1",       # hours of sleep in a 24-hour period
]

# Core demographics (sex, age, SES, race/ethnicity, geography)
demo_vars_2014 = [
    "SEX",            # sex
    "_AGEG5YR",       # 5-year age groups
    "EDUCA",          # education
    "INCOME2",        # household income categories
    "_RACEGR3",       # race groups
    "_HISPANC",       # hispanic ethnicity
    "_STATE",         # state FIPS
]

# Body size 
bmi_vars_2014 = [
    "_BMI5",          # BMI *100
    "_BMI5CAT",       # BMI categories
]

# Physical activity (simple indicator of any leisure-time PA)
activity_vars_2014 = [
    "_TOTINDA",       # any vs no leisure-time physical activity
]

# Smoking & alcohol: compact, derived indicators plus binge behavior
smoking_alcohol_vars_2014 = [
    "_SMOKER3",       # 4-level smoking status
    "_RFSMOK3",       # current smoker risk indicator
    "DRNK3GE5",       # binge drinking (yes/no)
    "_RFBING5",       # binge drinking flag (derived)
]

# General health / mental health
health_status_vars_2014 = [
    "GENHLTH",        # self-rated health
    "MENTHLTH",       # days mental health not good (0–30)
]

# Major chronic conditions (keep a small, high-yield set)
chronic_condition_vars_2014 = [
    "CVDINFR4",       # ever heart attack
    "CVDSTRK3",       # ever stroke
    "CHCCOPD1",       # COPD/emphysema/chronic bronchitis
    "ASTHMA3",        # ever asthma
    "DIABETE3",       # diabetes
    "ADDEPEV2",       # depressive disorder
]

# Combine into a candidate list and subset dataframe
candidate_cols_2014 = (
    sleep_vars_2014
    + demo_vars_2014
    + bmi_vars_2014
    + activity_vars_2014
    + smoking_alcohol_vars_2014
    + health_status_vars_2014
    + chronic_condition_vars_2014
)

# Keep only those that actually exist in loaded 2014 dataframe
relevant_cols_2014 = [c for c in candidate_cols_2014 if c in df.columns]

print("Missing from 2014 df:", [c for c in candidate_cols_2014 if c not in df.columns])

df_reduced_2014 = df[relevant_cols_2014].copy()

df_reduced_2014.shape

Missing from 2014 df: []


(464664, 23)

Data Cleaning

Invalidate values outside possible range, create a short sleep indicator from datapoints with sleep < 7 hours

In [432]:
# SLEPTIM1: valid 1–24; 77 = don't know, 99 = refused
if "SLEPTIM1" in df_reduced_2014.columns:
    df_reduced_2014["sleep_hours"] = (
        df_reduced_2014["SLEPTIM1"]
        .replace({77: np.nan, 99: np.nan})
        .astype("float")
    )

    # remove impossible values
    df_reduced_2014.loc[(df_reduced_2014["sleep_hours"] < 1) | (df_reduced_2014["sleep_hours"] > 24), "sleep_hours"] = np.nan

    # Short sleep indicator (< 7 hours)
    df_reduced_2014["short_sleep"] = np.where(df_reduced_2014["sleep_hours"] < 7, 1, 0)
    df_reduced_2014.loc[df_reduced_2014["sleep_hours"].isna(), "short_sleep"] = np.nan


Adjust BMI from _BMI5, and account for missing values. 

In [433]:
# BMI from _BMI5 (BMI*100, 9999 = missing)
if "_BMI5" in df_reduced_2014.columns:
    df_reduced_2014["bmi"] = df_reduced_2014["_BMI5"].astype("float")
    df_reduced_2014.loc[df_reduced_2014["bmi"] >= 9999, "bmi"] = np.nan
    df_reduced_2014["bmi"] = df_reduced_2014["bmi"] / 100.0


Assign sex to a binary value. 

Assign education values to one of 3 values to make it ordinal. 

Create binary values for low income, race (white) and race (hispanic)

In [434]:
# Sex: 1 = male, 2 = female, 7/9 = missing
if "SEX" in df_reduced_2014.columns:
    sex_clean = df_reduced_2014["SEX"].replace({7: np.nan, 9: np.nan})
    df_reduced_2014["sex_male"] = np.where(sex_clean == 1, 1,
                       np.where(sex_clean == 2, 0, np.nan))

# Education (EDUCA: 1–6 valid, 9 = DK/refused)
if "EDUCA" in df_reduced_2014.columns:
    edu_clean = df_reduced_2014["EDUCA"].replace({9: np.nan})
    # 0 = ≤HS, 1 = some college, 2 = college+
    df_reduced_2014["educ_3cat"] = np.select(
        [
            edu_clean.isin([1, 2, 3]),   # Never attended – HS grad
            edu_clean.isin([4, 5]),      # Some college/technical school
            edu_clean == 6,              # College grad+
        ],
        [0, 1, 2],
        default=np.nan
    )

# Income (INCOME2: 1–8, 77/99 = missing)
if "INCOME2" in df_reduced_2014.columns:
    inc_clean = df_reduced_2014["INCOME2"].replace({77: np.nan, 99: np.nan})
    # Low income (<$35k: 1–4), high income (5–8)
    df_reduced_2014["low_income"] = np.where(inc_clean.isin([1, 2, 3, 4]), 1,
                         np.where(inc_clean.isin([5, 6, 7, 8]), 0, np.nan))

# Race/ethnicity: _RACEGR3 (1=White NH, 2=Black NH, 3=Other NH, 4=Multiracial NH, 5=Hispanic)
if "_RACEGR3" in df_reduced_2014.columns:
    race_clean = df_reduced_2014["_RACEGR3"].replace({9: np.nan})
    df_reduced_2014["race_white_nh"] = np.where(race_clean == 1, 1,
                            np.where(race_clean.isin([2, 3, 4, 5]), 0, np.nan))
    df_reduced_2014["race_hispanic"] = np.where(race_clean == 5, 1,
                            np.where(race_clean.isin([1, 2, 3, 4]), 0, np.nan))


take leisure physical activity and assign binary values for two different designations. 

In [435]:
# _TOTINDA: 1 = any leisure-time PA, 2 = none, 9 = DK/refused
if "_TOTINDA" in df_reduced_2014.columns:
    pa_clean = df_reduced_2014["_TOTINDA"].replace({9: np.nan})
    df_reduced_2014["any_leisure_pa"] = np.where(pa_clean == 1, 1, np.where(pa_clean == 2, 0, np.nan))


Adjust smoker categories to two binary categories. Smoker ever for individuals that have smoked before and smoker current for individuals that currently smoke. 

In [436]:
# _SMOKER3: 1=current every day, 2=current some days, 3=former, 4=never, 9=DK/ref
if "_SMOKER3" in df_reduced_2014.columns:
    sm_clean = df_reduced_2014["_SMOKER3"].replace({9: np.nan})
    df_reduced_2014["smoker_current"] = sm_clean
    df_reduced_2014["smoker_ever"] = np.where(sm_clean.isin([1, 2, 3]), 1,
                          np.where(sm_clean == 4, 0, np.nan))

# _RFBING5: 1 = no binge, 2 = binge, 9 = DK/ref
if "_RFBING5" in df_reduced_2014.columns:
    binge_clean = df_reduced_2014["_RFBING5"].replace({9: np.nan})
    df_reduced_2014["binge_drink"] = np.where(binge_clean == 2, 1,
                          np.where(binge_clean == 1, 0, np.nan))


assign the general health rating to one of two categories, good health and poor health. 

assign mental_unhealthy days to valid values of METNTHLTH column, and assign a binary value for poor mental health days being frequent (>=14) or infrequent (<14)

In [437]:
# GENHLTH: 1=excellent ... 5=poor, 7/9 = DK/ref
if "GENHLTH" in df_reduced_2014.columns:
    gh_clean = df_reduced_2014["GENHLTH"].replace({7: np.nan, 9: np.nan})
    df_reduced_2014["fairpoor_health"] = np.where(gh_clean.isin([4, 5]), 1,
                              np.where(gh_clean.isin([1, 2, 3]), 0, np.nan))

# MENTHLTH: 0–30, 88 = none, 77/99 = DK/ref
if "MENTHLTH" in df_reduced_2014.columns:
    mental = df_reduced_2014["MENTHLTH"].replace({77: np.nan, 99: np.nan, 88: 0}).astype("float")
    df_reduced_2014["mental_unhealthy_days"] = mental
    df_reduced_2014["frequent_mental_distress"] = np.where(mental >= 14, 1,
                                       np.where(mental < 14, 0, np.nan))


take all of the diagnosis variables and store them as 0 for never having had, and 1 for having had. 

Apply the same logic to different diabetes categories. 

In [438]:
# Binary ever-diagnosed variables: 1=yes, 2=no, 7/9=missing
for var in ["CVDINFR4", "CVDSTRK3", "CHCCOPD1", "ASTHMA3", "ADDEPEV2"]:
    if var in df_reduced_2014.columns:
        x = df_reduced_2014[var].replace({7: np.nan, 9: np.nan})
        new_name = var.lower() + "_ever"
        df_reduced_2014[new_name] = np.where(x == 1, 1,
                         np.where(x == 2, 0, np.nan))

# DIABETE3: 1=yes, 2=yes (pregnant), 3=no, 4=pre-diabetes; 7/9=missing
if "DIABETE3" in df_reduced_2014.columns:
    dia = df_reduced_2014["DIABETE3"].replace({7: np.nan, 9: np.nan})
    df_reduced_2014["diabetes_any"] = np.where(dia.isin([1, 2]), 1,
                           np.where(dia.isin([3, 4]), 0, np.nan))
    df_reduced_2014["prediabetes"] = np.where(dia == 4, 1,
                          np.where(dia.isin([1, 2, 3]), 0, np.nan))


Remove NaN sleep hours

In [439]:
# Keep only respondents with valid sleep_hours
if "sleep_hours" in df_reduced_2014.columns:
    df_reduced_2014 = df_reduced_2014[df_reduced_2014["sleep_hours"].notna()].copy()

df_reduced_2014.shape

(458172, 45)

rename technical names for diseases

In [440]:
df_reduced_2014 = df_reduced_2014.rename(columns={
    "cvdinfr4_ever": "heartattack_ever",
    "cvdstrk3_ever": "stroke_ever",
    "chccopd1_ever": "lungdisease_ever",
    "addepev2_ever": "depression_ever",
})


In [441]:
print(df_reduced_2014.columns)

Index(['SLEPTIM1', 'SEX', '_AGEG5YR', 'EDUCA', 'INCOME2', '_RACEGR3',
       '_HISPANC', '_STATE', '_BMI5', '_BMI5CAT', '_TOTINDA', '_SMOKER3',
       '_RFSMOK3', 'DRNK3GE5', '_RFBING5', 'GENHLTH', 'MENTHLTH', 'CVDINFR4',
       'CVDSTRK3', 'CHCCOPD1', 'ASTHMA3', 'DIABETE3', 'ADDEPEV2',
       'sleep_hours', 'short_sleep', 'bmi', 'sex_male', 'educ_3cat',
       'low_income', 'race_white_nh', 'race_hispanic', 'any_leisure_pa',
       'smoker_current', 'smoker_ever', 'binge_drink', 'fairpoor_health',
       'mental_unhealthy_days', 'frequent_mental_distress', 'heartattack_ever',
       'stroke_ever', 'lungdisease_ever', 'asthma3_ever', 'depression_ever',
       'diabetes_any', 'prediabetes'],
      dtype='object')


Establish final columns that will be used in modeling

In [442]:
final_columns = [
    # Sleep
    "sleep_hours",
    "short_sleep",

    # Demographics
    "sex_male",
    "_AGEG5YR",
    "educ_3cat",
    "low_income",
    "race_white_nh",
    "race_hispanic",
    
    # Health behaviors
    "any_leisure_pa",
    "smoker_current",
    "binge_drink",

    # BMI
    "bmi",

    # Health status
    "fairpoor_health",
    "frequent_mental_distress",

    # Chronic conditions
    "heartattack_ever",
    "stroke_ever",
    "lungdisease_ever",
    "asthma3_ever",
    "depression_ever",
    "diabetes_any",
]

df_clean = df_reduced_2014[final_columns].copy()
print(df_clean.head())

df_clean.shape

   sleep_hours  short_sleep  sex_male  _AGEG5YR  educ_3cat  low_income  \
0          9.0          0.0       0.0       9.0        1.0         0.0   
1          6.0          1.0       1.0      11.0        1.0         1.0   
2          8.0          0.0       1.0       7.0        2.0         0.0   
3          8.0          0.0       0.0      10.0        2.0         0.0   
4          8.0          0.0       0.0      10.0        1.0         1.0   

   race_white_nh  race_hispanic  any_leisure_pa  smoker_current  binge_drink  \
0            0.0            0.0             0.0             3.0          0.0   
1            1.0            0.0             1.0             4.0          0.0   
2            1.0            0.0             1.0             3.0          0.0   
3            1.0            0.0             0.0             4.0          0.0   
4            1.0            0.0             0.0             4.0          0.0   

     bmi  fairpoor_health  frequent_mental_distress  heartattack_ever  \
0

(458172, 20)

Data is clean (i think) and ready to explore!

In [443]:
# Numeric (continuous)
numeric_features = [
    "bmi",
]
numeric_features = [c for c in numeric_features if c in df_clean.columns]

# Ordinal (ordered categories)
ordinal_features = [
    "_AGEG5YR",   # 5-year age groups
    "educ_3cat",  # 0 <=HS, 1 some college, 2 college+
    "smoker_current", # 1 every day, 2 some days, 3 before, 4 never
]
ordinal_features = [c for c in ordinal_features if c in df_clean.columns]

# Binary indicators (already 0/1; pass through)
binary_features = [
    "sex_male",
    "race_white_nh",
    "race_hispanic",
    "any_leisure_pa",
    "binge_drink",
    "obese",
    "fairpoor_health",
    "frequent_mental_distress",
    "heartattack_ever",
    "stroke_ever",
    "lungdisease_ever",
    "asthma3_ever",
    "depression_ever",
    "diabetes_any",
]
binary_features = [c for c in binary_features if c in df_clean.columns]

print("Numeric:", numeric_features)
print("Ordinal:", ordinal_features)
print("Binary:", binary_features)


Numeric: ['bmi']
Ordinal: ['_AGEG5YR', 'educ_3cat', 'smoker_current']
Binary: ['sex_male', 'race_white_nh', 'race_hispanic', 'any_leisure_pa', 'binge_drink', 'fairpoor_health', 'frequent_mental_distress', 'heartattack_ever', 'stroke_ever', 'lungdisease_ever', 'asthma3_ever', 'depression_ever', 'diabetes_any']


In [444]:
df_clean.shape

(458172, 20)

In [445]:

feature_groups = {
    "numeric": numeric_features,
    "ordinal": ordinal_features,
    "binary": binary_features,
}

nan_counts = {
    group: df_clean[cols].isna().sum()
    for group, cols in feature_groups.items()
}

for group, counts in nan_counts.items():
    print(f"\nNaN counts for {group} features:")
    print(counts)

total_nan_per_group = {
    group: df_clean[cols].isna().sum().sum()
    for group, cols in feature_groups.items()
}

print(total_nan_per_group)


NaN counts for numeric features:
bmi    29548
dtype: int64

NaN counts for ordinal features:
_AGEG5YR              0
educ_3cat          3570
smoker_current    20584
dtype: int64

NaN counts for binary features:
sex_male                        0
race_white_nh                7542
race_hispanic                7542
any_leisure_pa               1628
binge_drink                 28557
fairpoor_health              1515
frequent_mental_distress     7211
heartattack_ever             2155
stroke_ever                  1260
lungdisease_ever             2330
asthma3_ever                 1414
depression_ever              1922
diabetes_any                  777
dtype: int64
{'numeric': np.int64(29548), 'ordinal': np.int64(24154), 'binary': np.int64(63853)}


In [446]:
for group, cols in feature_groups.items():
    print(f"\n{group.upper()} FEATURES")
    
    # number of NaNs per feature
    nan_counts = df_clean[cols].isna().sum()
    
    # ratio of NaNs per feature
    nan_ratios = df_clean[cols].isna().mean()
    
    # combine into a single DataFrame for display
    summary = pd.DataFrame({
        "num_missing": nan_counts,
        "missing_ratio": nan_ratios
    })
    
    print(summary)



NUMERIC FEATURES
     num_missing  missing_ratio
bmi        29548       0.064491

ORDINAL FEATURES
                num_missing  missing_ratio
_AGEG5YR                  0       0.000000
educ_3cat              3570       0.007792
smoker_current        20584       0.044926

BINARY FEATURES
                          num_missing  missing_ratio
sex_male                            0       0.000000
race_white_nh                    7542       0.016461
race_hispanic                    7542       0.016461
any_leisure_pa                   1628       0.003553
binge_drink                     28557       0.062328
fairpoor_health                  1515       0.003307
frequent_mental_distress         7211       0.015739
heartattack_ever                 2155       0.004703
stroke_ever                      1260       0.002750
lungdisease_ever                 2330       0.005085
asthma3_ever                     1414       0.003086
depression_ever                  1922       0.004195
diabetes_any          

In [447]:
cols_to_filter = ["bmi", "smoker_current", "binge_drink"]

df_filtered = df_clean.dropna(subset=cols_to_filter)

print(df_filtered.shape)


(407198, 20)


In [448]:

feature_groups = {
    "numeric": numeric_features,
    "ordinal": ordinal_features,
    "binary": binary_features,
}

nan_counts = {
    group: df_filtered[cols].isna().sum()
    for group, cols in feature_groups.items()
}

for group, counts in nan_counts.items():
    print(f"\nNaN counts for {group} features:")
    print(counts)

total_nan_per_group = {
    group: df_filtered[cols].isna().sum().sum()
    for group, cols in feature_groups.items()
}

print(total_nan_per_group)


NaN counts for numeric features:
bmi    0
dtype: int64

NaN counts for ordinal features:
_AGEG5YR            0
educ_3cat         683
smoker_current      0
dtype: int64

NaN counts for binary features:
sex_male                       0
race_white_nh               5340
race_hispanic               5340
any_leisure_pa               755
binge_drink                    0
fairpoor_health             1215
frequent_mental_distress    5698
heartattack_ever            1714
stroke_ever                  989
lungdisease_ever            1865
asthma3_ever                1102
depression_ever             1453
diabetes_any                 522
dtype: int64
{'numeric': np.int64(0), 'ordinal': np.int64(683), 'binary': np.int64(25993)}


In [449]:
for group, cols in feature_groups.items():
    print(f"\n {group.upper()} FEATURES")
    
    # number of NaNs per feature
    nan_counts = df_filtered[cols].isna().sum()
    
    # ratio of NaNs per feature
    nan_ratios = df_filtered[cols].isna().mean()
    
    # combine into a single DataFrame for display
    summary = pd.DataFrame({
        "num_missing": nan_counts,
        "missing_ratio": nan_ratios
    })
    
    print(summary)



 NUMERIC FEATURES
     num_missing  missing_ratio
bmi            0            0.0

 ORDINAL FEATURES
                num_missing  missing_ratio
_AGEG5YR                  0       0.000000
educ_3cat               683       0.001677
smoker_current            0       0.000000

 BINARY FEATURES
                          num_missing  missing_ratio
sex_male                            0       0.000000
race_white_nh                    5340       0.013114
race_hispanic                    5340       0.013114
any_leisure_pa                    755       0.001854
binge_drink                         0       0.000000
fairpoor_health                  1215       0.002984
frequent_mental_distress         5698       0.013993
heartattack_ever                 1714       0.004209
stroke_ever                       989       0.002429
lungdisease_ever                 1865       0.004580
asthma3_ever                     1102       0.002706
depression_ever                  1453       0.003568
diabetes_any       

In [450]:
df_filtered.shape

(407198, 20)

### Modeling

In [451]:
target = "short_sleep"

print(df_filtered[target].value_counts())

df_model = df_filtered[df_filtered[target].notna()].copy()

X = df_model.drop(columns=[target])
y = df_model[target]


X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42
)


short_sleep
0.0    278988
1.0    128210
Name: count, dtype: int64


In [452]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")), # give the numeric values a common value if missing
    ("scaler", StandardScaler()),
])

ordinal_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), #give the ordinal values a common value if missing
    ("scaler", StandardScaler()),
])

binary_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), #give the binary values the most common value if missing 
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("ord", ordinal_transformer, ordinal_features),
        ("bin", binary_transformer, binary_features),
    ]
)


In [453]:
log_reg_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(
        max_iter=500,
        class_weight="balanced",  # helps with class imbalance
        n_jobs=-1
    )),
])

log_reg_pipeline.fit(X_train, y_train)
y_pred_lr = log_reg_pipeline.predict(X_val)

print("Logistic Regression")
print(classification_report(y_val, y_pred_lr))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred_lr))


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


Logistic Regression
              precision    recall  f1-score   support

         0.0       0.76      0.67      0.71     69747
         1.0       0.43      0.53      0.47     32053

    accuracy                           0.63    101800
   macro avg       0.59      0.60      0.59    101800
weighted avg       0.65      0.63      0.64    101800

Confusion matrix:
 [[46604 23143]
 [14937 17116]]


  ret = a @ b
  ret = a @ b
  ret = a @ b


In [454]:
rf_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        max_features= 'sqrt',
        class_weight="balanced",  # also helps with imbalance
        random_state=42,
        n_jobs=-1
    )),
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_val)

print("Random Forest")
print(classification_report(y_val, y_pred_rf))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred_rf))


Random Forest
              precision    recall  f1-score   support

         0.0       0.71      0.74      0.73     69747
         1.0       0.38      0.34      0.35     32053

    accuracy                           0.61    101800
   macro avg       0.54      0.54      0.54    101800
weighted avg       0.60      0.61      0.61    101800

Confusion matrix:
 [[51852 17895]
 [21298 10755]]


In [455]:
rf_model = rf_pipeline.named_steps["clf"]
encoded_feature_names = rf_pipeline.named_steps["preprocess"].get_feature_names_out()
importances = rf_model.feature_importances_

feat_imp = (
    pd.DataFrame({"feature": encoded_feature_names, "importance": importances})
      .sort_values("importance", ascending=False)
)

feat_imp.head(20)


Unnamed: 0,feature,importance
0,num__bmi,0.639661
1,ord___AGEG5YR,0.11865
3,ord__smoker_current,0.0379
2,ord__educ_3cat,0.026949
4,bin__sex_male,0.018745
10,bin__frequent_mental_distress,0.016864
9,bin__fairpoor_health,0.016315
16,bin__diabetes_any,0.016103
7,bin__any_leisure_pa,0.014782
15,bin__depression_ever,0.014712
