In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# STEP 1: Load data
df = pd.read_excel("/content/Neurology.xlsx", sheet_name="Data")

# STEP 2: Keep only numeric features and extract target
numeric_df = df.select_dtypes(include=[np.number])
X = numeric_df.drop(columns=['Health Behaviors Quartile'])
y = df['Health Behaviors Quartile']

# If target is non-numeric, encode it
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# STEP 3: Impute missing values with -1 AND add missingness indicators
# 3a. Create missing-indicator DataFrame
missing_ind = X.isna().astype(int).add_suffix('_missing')

# 3b. Impute all NaNs with -1
imputer = SimpleImputer(strategy='constant', fill_value=-1)
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# 3c. Concatenate indicators
X_aug = pd.concat([X_imputed, missing_ind], axis=1)

# STEP 4: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_aug)

# STEP 5: PCA (retain 95% of variance)
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# STEP 6: Train/Test split (stratify to preserve class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y
)

# STEP 7: SMOTE on training set only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# STEP 8: Define models
models = {
    "Logistic Regression": LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# STEP 9: Train, predict, and evaluate
def evaluate(name, model, X_tr, y_tr, X_te, y_te):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_te, y_pred))
    print("Classification Report:\n", classification_report(y_te, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_te, y_pred))

for name, mdl in models.items():
    evaluate(name, mdl, X_train_res, y_train_res, X_test, y_test)






=== Logistic Regression ===
Accuracy: 0.7369714847590954
Classification Report:
               precision    recall  f1-score   support

           1      0.791     0.741     0.765       506
           2      0.382     0.479     0.425       374
           3      0.866     0.819     0.842      1154

    accuracy                          0.737      2034
   macro avg      0.680     0.680     0.677      2034
weighted avg      0.758     0.737     0.746      2034

Confusion Matrix:
 [[375 105  26]
 [ 75 179 120]
 [ 24 185 945]]

=== Random Forest ===
Accuracy: 0.9124877089478859
Classification Report:
               precision    recall  f1-score   support

           1      0.940     0.925     0.932       506
           2      0.797     0.765     0.780       374
           3      0.936     0.955     0.946      1154

    accuracy                          0.912      2034
   macro avg      0.891     0.882     0.886      2034
weighted avg      0.911     0.912     0.912      2034

Confusion Matri

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import statsmodels.api as sm

# STEP 1: Load data
df = pd.read_excel("/content/Neurology.xlsx", sheet_name="Data")

# STEP 2: Keep only numeric features and extract binary target 'Health Ranking'
numeric_df = df.select_dtypes(include=[np.number])
X = numeric_df.drop(columns=['Health Ranking'])
y = df['Health Ranking']

# STEP 3: Impute missing values with -1 AND add missingness indicators
missing_ind = X.isna().astype(int).add_suffix('_missing')
imputer = SimpleImputer(strategy='constant', fill_value=-1)
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
X_aug = pd.concat([X_imputed, missing_ind], axis=1)

# STEP 4: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_aug)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_aug.columns)
# Drop highly correlated features before statsmodels
def drop_high_corr_features(df, threshold=0.99):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop)

X_scaled_df_dedup = drop_high_corr_features(X_scaled_df)
X_scaled_df_dedup = X_scaled_df_dedup.loc[:, X_scaled_df_dedup.std() > 1e-6]  # drop constant columns

# Add constant for intercept and fit model
X_sm = sm.add_constant(X_scaled_df_dedup)
logit_model = sm.Logit(y, X_sm).fit()
print("\n=== Logistic Regression Summary (statsmodels) ===")
print(logit_model.summary())


# STEP 5: PCA (retain 95% of variance)
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# STEP 6: Train/Test split (stratify to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y
)

# STEP 7: Apply SMOTE only to training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# STEP 8: Define models
models = {
    "Logistic Regression": LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# STEP 9: Train, predict, and evaluate
def evaluate(name, model, X_tr, y_tr, X_te, y_te):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_te, y_pred))
    print("Classification Report:\n", classification_report(y_te, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_te, y_pred))

for name, mdl in models.items():
    evaluate(name, mdl, X_train_res, y_train_res, X_test, y_test)



Optimization terminated successfully.
         Current function value: 0.009790
         Iterations 14

=== Logistic Regression Summary (statsmodels) ===
                           Logit Regression Results                           
Dep. Variable:         Health Ranking   No. Observations:                 2505
Model:                          Logit   Df Residuals:                     2475
Method:                           MLE   Df Model:                           29
Date:                Fri, 11 Apr 2025   Pseudo R-squ.:                  0.9858
Time:                        22:59:37   Log-Likelihood:                -24.524
converged:                       True   LL-Null:                       -1731.3
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                           coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import statsmodels.api as sm
# STEP 1: Load data
df = pd.read_excel("/content/Neurology.xlsx", sheet_name="Data")

# STEP 2: Keep only numeric features and extract binary target 'Health Ranking'
numeric_df = df.select_dtypes(include=[np.number])


# Drop 'Health Ranking' from features, and also drop 'Health Quartile' and 'FIPS_Code' if they exist
X = numeric_df.drop(columns=['Health Ranking'])
for col in ['Health Quartile', 'FIPS_Code']:
    if col in X.columns:
        X = X.drop(columns=[col])

y = df['Health Ranking']

# STEP 3: Impute missing values with -1 AND add missingness indicators
missing_ind = X.isna().astype(int).add_suffix('_missing')
imputer = SimpleImputer(strategy='constant', fill_value=-1)
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
X_aug = pd.concat([X_imputed, missing_ind], axis=1)

# STEP 4: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_aug)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_aug.columns)

# Drop highly correlated features before statsmodels
def drop_high_corr_features(df, threshold=0.99):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop)

X_scaled_df_dedup = drop_high_corr_features(X_scaled_df)
X_scaled_df_dedup = X_scaled_df_dedup.loc[:, X_scaled_df_dedup.std() > 1e-6]  # drop near-constant columns

# STEP 4.5: Statsmodels logistic regression with p-value filtering
X_sm = sm.add_constant(X_scaled_df_dedup)
logit_model = sm.Logit(y, X_sm).fit()
print("\n=== Full Logistic Regression Summary ===")
print(logit_model.summary())

# Filter variables with p-values <= 0.05
significant_vars = logit_model.pvalues[logit_model.pvalues <= 0.05].index
X_sm_significant = X_sm[significant_vars]

# Refit model using only significant features
logit_model_significant = sm.Logit(y, X_sm_significant).fit()
print("\n=== Logistic Regression Summary (Only p ≤ 0.05 Variables) ===")
print(logit_model_significant.summary())

# STEP 5: PCA (retain 95% of variance)
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# STEP 6: Train/Test split (stratify to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y
)

# STEP 8: Define models
models = {
    "Logistic Regression": LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# STEP 9: Train, predict, and evaluate
def evaluate(name, model, X_tr, y_tr, X_te, y_te):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_te, y_pred))
    print("Classification Report:\n", classification_report(y_te, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_te, y_pred))

for name, mdl in models.items():
    evaluate(name, mdl, X_train, y_train, X_test, y_test)


Optimization terminated successfully.
         Current function value: 0.469331
         Iterations 9

=== Full Logistic Regression Summary ===
                           Logit Regression Results                           
Dep. Variable:         Health Ranking   No. Observations:                 2505
Model:                          Logit   Df Residuals:                     2477
Method:                           MLE   Df Model:                           27
Date:                Thu, 17 Apr 2025   Pseudo R-squ.:                  0.3209
Time:                        13:26:38   Log-Likelihood:                -1175.7
converged:                       True   LL-Null:                       -1731.3
Covariance Type:            nonrobust   LLR p-value:                6.131e-217
                                                           coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

In [5]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import statsmodels.api as sm

# STEP 1: Load data
df = pd.read_excel("/content/Neurology.xlsx", sheet_name="Data")

# STEP 2: Keep only numeric features and extract binary target 'Health Ranking'
numeric_df = df.select_dtypes(include=[np.number])
X = numeric_df.drop(columns=['Health Ranking'])
for col in ['Health Quartile', 'FIPS_Code']:
    if col in X.columns:
        X = X.drop(columns=[col])
y = df['Health Ranking']


# STEP 3: Impute missing values with -1 AND add missingness indicators
missing_ind = X.isna().astype(int).add_suffix('_missing')
imputer = SimpleImputer(strategy='constant', fill_value=-1)
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
X_aug = pd.concat([X_imputed, missing_ind], axis=1)

# STEP 3.5: Remove specific columns and all *_missing columns
columns_to_remove = [
    'Mental Health Providers',
    'Average Number of Mentally Unhealthy Days',
    'Preventable Hospitalization Rate_missing',
    'number_of_dual_eligible_users_missing','# Mental Health Providers' ,
'Mental Health Provider Rate', '% Frequent Mental Distress'
]
# Remove listed columns if they exist
X_aug = X_aug.drop(columns=[col for col in columns_to_remove if col in X_aug.columns])

# Remove all *_missing columns
X_aug = X_aug.drop(columns=[col for col in X_aug.columns if col.endswith('_missing')])

# STEP 4: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_aug)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_aug.columns)

# Drop highly correlated features
def drop_high_corr_features(df, threshold=0.99):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop)

X_scaled_df_dedup = drop_high_corr_features(X_scaled_df)
X_scaled_df_dedup = X_scaled_df_dedup.loc[:, X_scaled_df_dedup.std() > 1e-6]  # drop near-constant columns

# STEP 4.5: Backward Elimination Function
def backward_elimination(X, y, sl=0.05):
    X_ = sm.add_constant(X.copy())
    while True:
        model = sm.Logit(y, X_).fit(disp=False)
        p_values = model.pvalues
        max_p = p_values.max()
        if max_p > sl:
            worst_feature = p_values.idxmax()
            print(f"Removing '{worst_feature}' with p = {max_p:.4f}")
            X_ = X_.drop(columns=[worst_feature])
        else:
            break
    final_model = sm.Logit(y, X_).fit()
    return final_model, X_

# STEP 4.6: Apply Backward Elimination
logit_model_optimized, X_selected = backward_elimination(X_scaled_df_dedup, y)

print("\n=== Logistic Regression Summary (Backward Elimination) ===")
print(logit_model_optimized.summary())

# STEP 5: PCA (retain 95% of variance)
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# STEP 6: Train/Test split (stratify to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y
)

# STEP 8: Define models
models = {
    "Logistic Regression": LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# STEP 9: Train, predict, and evaluate
def evaluate(name, model, X_tr, y_tr, X_te, y_te):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_te, y_pred))
    print("Classification Report:\n", classification_report(y_te, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_te, y_pred))

for name, mdl in models.items():
    evaluate(name, mdl, X_train, y_train, X_test, y_test)



Removing 'percentage_of_users_out_of_ffs_beneficiaries' with p = 0.8918
Removing 'Population' with p = 0.6083
Removing 'percentage_of_dual_eligible_users_out_of_total_users' with p = 0.2878
Removing 'number_of_providers' with p = 0.0728
Optimization terminated successfully.
         Current function value: 0.503256
         Iterations 9

=== Logistic Regression Summary (Backward Elimination) ===
                           Logit Regression Results                           
Dep. Variable:         Health Ranking   No. Observations:                 2505
Model:                          Logit   Df Residuals:                     2490
Method:                           MLE   Df Model:                           14
Date:                Thu, 17 Apr 2025   Pseudo R-squ.:                  0.2718
Time:                        16:57:27   Log-Likelihood:                -1260.7
converged:                       True   LL-Null:                       -1731.3
Covariance Type:            nonrobust   LLR p-va