In [14]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from Logistic_bootstrap_metrics import bootstrap_metrics


In [15]:
# Load your dataset
df = pd.read_csv("data/TrainTest_Table.csv")

# Ensure MMSCORE and Group are available and not missing
df = df.dropna(subset=['MMSCORE', 'AD'])

# Show sample
df[['MMSCORE', 'AD', 'Split']].head()


Unnamed: 0,MMSCORE,AD,Split
0,30.0,0,Train
1,30.0,0,Train
2,29.0,0,Train
3,30.0,0,Train
4,28.0,0,Test


In [16]:
# Split into train and test based on 'Split' column
df_train = df[df['Split'] == 'Train']
df_test = df[df['Split'] == 'Test']

# Define features and target
X_train = df_train[['MMSCORE']]
y_train = df_train['AD']
X_test = df_test[['MMSCORE']]
y_test = df_test['AD']


In [17]:
# Run logistic regression and bootstrap
results = bootstrap_metrics(
    df_train=X_train,
    df_test=X_test,
    independent_vars=['MMSCORE'],
    dep_var_train=y_train,
    dep_var_test=y_test,
    n_bootstrap=1000,
    threshold=0.5
)


In [20]:
# Show odds ratios
print("\n===== Odds Ratios (one-time fit) =====")
print(results["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")



===== Odds Ratios (one-time fit) =====
           Odds Ratio   95% CI Lower  95% CI Upper       p-value
const    1.108315e+07  584766.636652  2.100603e+08  3.205904e-27
MMSCORE  5.208992e-01       0.467053  5.809535e-01  1.066395e-31

===== Bootstrapped Metrics (1000 runs) =====
AUC: Mean = 0.7946, 95% CI = [0.7893, 0.8066]
Balanced Accuracy: Mean = 0.6498, 95% CI = [0.6437, 0.6546]
Sensitivity: Mean = 0.3209, 95% CI = [0.3091, 0.3273]
Specificity: Mean = 0.9787, 95% CI = [0.9783, 0.9819]
F1-score: Mean = 0.4493, 95% CI = [0.4359, 0.4615]


In [19]:
# Define MMSE columns
mmse_columns = [
    "MMYEAR", "MMMONTH", "MMDAY", "MMSEASON", "MMDATE",
    "MMSTATE", "MMCITY", "MMAREA", "MMHOSPIT", "MMFLOOR",
    "WORD1", "WORD2", "WORD3",
    "MMD", "MML", "MMR", "MMO", "MMW",
    "WORD1DL", "WORD2DL", "WORD3DL",
    "MMWATCH", "MMPENCIL", "MMREPEAT",
    "MMHAND", "MMFOLD", "MMONFLR",
    "MMREAD", "MMWRITE", "MMDRAW"
]

# Drop rows with any missing MMSE or AD values
df_mmse = df.dropna(subset=mmse_columns + ['AD'])

# Split into train and test
df_train_mmse = df_mmse[df_mmse['Split'] == 'Train']
df_test_mmse = df_mmse[df_mmse['Split'] == 'Test']

# Define X and y
X_train_mmse = df_train_mmse[mmse_columns]
y_train_mmse = df_train_mmse['AD']
X_test_mmse = df_test_mmse[mmse_columns]
y_test_mmse = df_test_mmse['AD']

# Step 1: Drop constant columns
X_train_mmse = X_train_mmse.loc[:, X_train_mmse.nunique() > 1]
X_test_mmse = X_test_mmse[X_train_mmse.columns]

# Step 2: Drop collinear columns based on matrix rank
X_rank = np.linalg.matrix_rank(X_train_mmse)
while X_rank < X_train_mmse.shape[1]:
    corr_matrix = X_train_mmse.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = upper.stack().idxmax()  # Most collinear pair
    X_train_mmse = X_train_mmse.drop(columns=[to_drop[1]])
    X_test_mmse = X_test_mmse.drop(columns=[to_drop[1]])
    X_rank = np.linalg.matrix_rank(X_train_mmse)

# Run regression
results_mmse = bootstrap_metrics(
    df_train=X_train_mmse,
    df_test=X_test_mmse,
    independent_vars=X_train_mmse.columns.tolist(),
    dep_var_train=y_train_mmse,
    dep_var_test=y_test_mmse,
    n_bootstrap=1000,
    threshold=0.5
)

# Show results
print("\n===== MMSE Questions - Odds Ratios (one-time fit) =====")
print(results_mmse["Odds Ratios (one-time fit)"])

print("\n===== MMSE Questions - Bootstrapped Metrics (1000 runs) =====")
for metric, (mean, ci) in results_mmse["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


LinAlgError: Singular matrix