In [1]:
import pandas as pd
import polars as pl

# Set options to display all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [2]:
data_path = "./data/meps_2023/"
main_df = "h251.xlsx"

In [3]:
# reading file usng polars for faster speed
df_pl1 = pl.read_excel(f"%s%s" % (data_path, main_df))
# print(df_pl1.columns)

In [4]:
keep_cols = [
    # --- Identifiers ---
    "DUPERSID",
    # --- Demographics & SES ---
    "AGELAST",
    "SEX",
    "RACEV1X",
    "RACETHX",
    "HISPANX",
    "EDUCYR",
    "FAMINC23",
    "POVCAT23",
    "INSCOV23",
    "INSURC23",
    "REGION23",
    # --- Nativity / Language (optional but useful) ---
    "BORNUSA",
    "YRSINUS",
    # --- Health status ---
    "RTHLTH53",
    "MNHLTH53",
    "ADSMOK42",
    # --- Chronic conditions ---
    "HIBPDX",
    "CHDDX",
    "ASTHDX",
    "DIABDX_M18",
    # --- Utilization ---
    "OBTOTV23",
    "OPTOTV23",
    "ERTOT23",
    "DVTOT23",
    "TOTEXP23",
    # --- Hospitalization indicators / target sources ---
    "IPDIS23",
    "IPTEXP23",
    "IPNGTD23",
]

In [5]:
rename_map = {
    # Identifiers
    "DUPERSID": "person_id",
    # Demographics & SES
    "AGELAST": "age",
    "SEX": "sex",
    "RACEV1X": "race_simple",
    "RACETHX": "race_ethnicity",
    "HISPANX": "hispanic",
    "EDUCYR": "education_years",
    "FAMINC23": "family_income",
    "POVCAT23": "poverty_category",
    "INSCOV23": "insurance_coverage",
    "INSURC23": "insurance_category",
    "REGION23": "region",
    # Nativity / Language
    "BORNUSA": "born_in_usa",
    "YRSINUS": "years_in_us",
    # Health status
    "RTHLTH53": "self_rated_health",
    "MNHLTH53": "self_rated_mental_health",
    "ADSMOK42": "smoker",
    # Chronic conditions
    "HIBPDX": "hypertension_dx",
    "CHDDX": "coronary_hd_dx",
    "ASTHDX": "asthma_dx",
    "DIABDX_M18": "diabetes_dx",
    # Utilization
    "OBTOTV23": "office_visits",
    "OPTOTV23": "outpatient_visits",
    "ERTOT23": "er_visits",
    "DVTOT23": "total_visits",
    "TOTEXP23": "total_expenditures",
    # Hospitalization indicators / sources
    "IPDIS23": "inpatient_discharges",
    "IPTEXP23": "inpatient_expenditures",
    "IPNGTD23": "inpatient_nights",
}

In [6]:
df_pl = df_pl1[keep_cols]
df_pl = df_pl.rename(rename_map)

display(df_pl.head())

person_id,age,sex,race_simple,race_ethnicity,hispanic,education_years,family_income,poverty_category,insurance_coverage,insurance_category,region,born_in_usa,years_in_us,self_rated_health,self_rated_mental_health,smoker,hypertension_dx,coronary_hd_dx,asthma_dx,diabetes_dx,office_visits,outpatient_visits,er_visits,total_visits,total_expenditures,inpatient_discharges,inpatient_expenditures,inpatient_nights
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""2790002101""",58,2,2,3,2,17,130700,5,1,1,2,1,-1,4,3,2,2,2,2,1,3,1,0,0,646,0,0,0
"""2790002102""",27,1,2,3,2,12,130700,5,1,1,2,1,-1,2,2,-1,2,2,1,2,1,0,0,2,1894,0,0,0
"""2790004101""",49,2,1,2,2,17,87000,5,1,1,2,1,-1,1,1,2,2,2,2,2,1,0,0,1,986,0,0,0
"""2790006101""",75,2,1,2,2,12,38000,4,2,4,2,1,-1,2,2,1,1,2,2,1,3,0,0,0,1312,0,0,0
"""2790006102""",23,1,1,2,2,11,38000,4,2,2,2,1,-1,2,2,-1,2,2,2,2,0,0,0,0,0,0,0,0


In [7]:
df_pd = df_pl.to_pandas()

In [8]:
categorical_cols = [
    "sex",
    "race_simple",
    "race_ethnicity",
    "hispanic",
    "poverty_category",
    "insurance_coverage",
    "insurance_category",
    "region",
    "born_in_usa",
    "self_rated_health",
    "self_rated_mental_health",
    "smoker",
    "hypertension_dx",
    "coronary_hd_dx",
    "asthma_dx",
    "diabetes_dx",
]

In [9]:
df_ohe_pd = pd.get_dummies(
    df_pd,
    columns=categorical_cols,
    drop_first=False,
)

In [10]:
df_ohe_pd["hospitalized"] = (df_ohe_pd["inpatient_expenditures"] > 0).astype(int)

In [11]:
print(df_ohe_pd["hospitalized"].value_counts(normalize=True))

hospitalized
0    0.926265
1    0.073735
Name: proportion, dtype: float64


In [12]:
df_ohe_pd.shape
print(df_ohe_pd.head(1))

    person_id  age  education_years  family_income  years_in_us  \
0  2790002101   58               17         130700           -1   

   office_visits  outpatient_visits  er_visits  total_visits  \
0              3                  1          0             0   

   total_expenditures  inpatient_discharges  inpatient_expenditures  \
0                 646                     0                       0   

   inpatient_nights  sex_1  sex_2  race_simple_1  race_simple_2  \
0                 0  False   True          False           True   

   race_simple_3  race_simple_4  race_simple_6  race_ethnicity_1  \
0          False          False          False             False   

   race_ethnicity_2  race_ethnicity_3  race_ethnicity_4  race_ethnicity_5  \
0             False              True             False             False   

   hispanic_1  hispanic_2  poverty_category_1  poverty_category_2  \
0       False        True               False               False   

   poverty_category_3  pove

In [13]:
df_ohe_pd.info()
df_ohe_pd.describe().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18919 entries, 0 to 18918
Data columns (total 94 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   person_id                    18919 non-null  object
 1   age                          18919 non-null  int64 
 2   education_years              18919 non-null  int64 
 3   family_income                18919 non-null  int64 
 4   years_in_us                  18919 non-null  int64 
 5   office_visits                18919 non-null  int64 
 6   outpatient_visits            18919 non-null  int64 
 7   er_visits                    18919 non-null  int64 
 8   total_visits                 18919 non-null  int64 
 9   total_expenditures           18919 non-null  int64 
 10  inpatient_discharges         18919 non-null  int64 
 11  inpatient_expenditures       18919 non-null  int64 
 12  inpatient_nights             18919 non-null  int64 
 13  sex_1                        18

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,18919.0,43.716581,23.93955,0.0,23.0,45.0,64.0,85.0
education_years,18919.0,11.218405,5.497385,-8.0,9.0,12.0,16.0,17.0
family_income,18919.0,98815.895555,91575.419638,-230.0,35000.0,72800.0,137484.0,747346.0
years_in_us,18919.0,-0.148528,2.087025,-8.0,-1.0,-1.0,-1.0,5.0
office_visits,18919.0,7.140758,13.784344,0.0,0.0,2.0,8.0,419.0
outpatient_visits,18919.0,1.095195,4.489569,0.0,0.0,0.0,1.0,178.0
er_visits,18919.0,0.224166,0.672914,0.0,0.0,0.0,0.0,22.0
total_visits,18919.0,1.035943,1.688402,0.0,0.0,0.0,2.0,25.0
total_expenditures,18919.0,8422.054125,21664.25047,0.0,299.5,1816.0,7087.0,574675.0
inpatient_discharges,18919.0,0.096728,0.394552,0.0,0.0,0.0,0.0,10.0


In [14]:
corr = df_ohe_pd.corr(numeric_only=True)["hospitalized"].sort_values(ascending=False)
print(corr)

hospitalized                   1.000000
inpatient_discharges           0.865864
inpatient_expenditures         0.562237
total_expenditures             0.451419
inpatient_nights               0.444087
er_visits                      0.416078
age                            0.175374
hypertension_dx_1              0.164760
coronary_hd_dx_1               0.155536
self_rated_mental_health_-1    0.150579
self_rated_health_-1           0.150579
outpatient_visits              0.146469
office_visits                  0.141957
diabetes_dx_1                  0.126824
region_-1                      0.117282
self_rated_health_5            0.115014
insurance_category_4           0.109697
self_rated_health_4            0.107225
insurance_coverage_2           0.106903
insurance_category_6           0.103033
insurance_category_5           0.068628
self_rated_mental_health_4     0.064495
race_ethnicity_2               0.050264
asthma_dx_1                    0.049784
education_years                0.048283


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

# 1. Define X and y
cols_to_drop = [
    "hospitalized",
    "person_id",
    "inpatient_expenditures",
    "inpatient_nights",
    "inpatient_discharges",
]

X = df_ohe_pd.drop(columns=[c for c in cols_to_drop if c in df_ohe_pd.columns])
y = df_ohe_pd["hospitalized"]

# 2. Train/test split (stratified due to imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# 3. Baseline logistic regression with class weights (handles imbalance)
log_clf = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1,
)

log_clf.fit(X_train, y_train)

# 4. Evaluate
y_pred = log_clf.predict(X_test)
y_proba = log_clf.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

ROC-AUC: 0.9511513967909093
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      4381
           1       0.41      0.89      0.56       349

    accuracy                           0.90      4730
   macro avg       0.70      0.89      0.75      4730
weighted avg       0.95      0.90      0.91      4730



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
from sklearn.metrics import precision_recall_curve

prec, rec, thresh = precision_recall_curve(y_test, y_proba)

In [21]:
# Convert the clean Polars table (with renamed columns) to pandas
df_meta = df_pl.select(
    [
        "person_id",
        "age",
        "sex",
        "race_ethnicity",
        "hispanic",
        "poverty_category",
        "insurance_coverage",
        "family_income",
        "self_rated_health",
        "self_rated_mental_health",
        "hospitalized",
    ]
).to_pandas()

ColumnNotFoundError: unable to find column "hospitalized"; valid columns: ["person_id", "age", "sex", "race_simple", "race_ethnicity", "hispanic", "education_years", "family_income", "poverty_category", "insurance_coverage", "insurance_category", "region", "born_in_usa", "years_in_us", "self_rated_health", "self_rated_mental_health", "smoker", "hypertension_dx", "coronary_hd_dx", "asthma_dx", "diabetes_dx", "office_visits", "outpatient_visits", "er_visits", "total_visits", "total_expenditures", "inpatient_discharges", "inpatient_expenditures", "inpatient_nights"]

In [None]:
# Attach predictions to the fairness meta table
df_meta["pred_prob"] = log_clf.predict_proba(X)[:, 1]
df_meta["pred_label"] = log_clf.predict(X)