# Alzheimer's Prediction using MMSE Features
This notebook runs logistic regression with bootstrapping on MMSCORE and MMSE questions to predict Alzheimer's Disease (AD).

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from Logistic_bootstrap_metrics import *
from statsmodels.stats.outliers_influence import variance_inflation_factor


## Load and Prepare Data

In [52]:
# Load dataset
df = pd.read_csv("data/TrainTest_Table.csv")

# Split into train and test based on 'Split' column
df_train = df[df['Split'] == 'Train']
df_test = df[df['Split'] == 'Test']

# Define MMSE question columns
mmse_cols = [
    # A. Orientation
    "MMYEAR", "MMMONTH", "MMDAY", "MMSEASON", "MMDATE",
    "MMSTATE", "MMCITY", "MMAREA", "MMHOSPIT", "MMFLOOR",

    # B. Registration
    "WORD1", "WORD2", "WORD3",

    # C. Attention and Calculation – Serial 7s only
    "MMD", "MML", "MMR", "MMO", "MMW",

    # D. Recall
    "WORD1DL", "WORD2DL", "WORD3DL",

    # E. Language
    "MMWATCH", "MMPENCIL", "MMREPEAT",
    "MMHAND", "MMFOLD", "MMONFLR",
    "MMREAD", "MMWRITE", "MMDRAW"
]

## Logistic Regression on MMSCORE with Bootstrapping

In [53]:
# Define features and target
X_train = df_train[['MMSCORE']]
y_train = df_train['AD']
X_test = df_test[['MMSCORE']]
y_test = df_test['AD']

# Run bootstrapped logistic regression
results = bootstrap_metrics(
    df_train=X_train,
    df_test=X_test,
    independent_vars=['MMSCORE'],
    dep_var_train=y_train,
    dep_var_test=y_test,
    n_bootstrap=1000,
    threshold=0.5
)

# Display results
print("\n===== Odds Ratios (one-time fit) — MMSE Score =====")
print(results["Odds Ratios (one-time fit)"])

print("\n===== Bootstrapped Metrics (1000 runs) - MMSE Score =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


===== Odds Ratios (one-time fit) — MMSE Score =====
           Odds Ratio   95% CI Lower  95% CI Upper       p-value
const    1.108315e+07  584766.636652  2.100603e+08  3.205904e-27
MMSCORE  5.208992e-01       0.467053  5.809535e-01  1.066395e-31

===== Bootstrapped Metrics (1000 runs) - MMSE Score =====
AUC: Mean = 0.7946, 95% CI = [0.7893, 0.8066]
Balanced Accuracy: Mean = 0.6498, 95% CI = [0.6437, 0.6546]
Sensitivity: Mean = 0.3209, 95% CI = [0.3091, 0.3273]
Specificity: Mean = 0.9787, 95% CI = [0.9783, 0.9819]
F1-score: Mean = 0.4493, 95% CI = [0.4359, 0.4615]


## Logistic Regression on MMSE Questions with Bootstrapping

In [54]:
# Define features and target
X_train_mmse = df_train[mmse_cols]
y_train_mmse = df_train['AD']
X_test_mmse = df_test[mmse_cols]
y_test_mmse = df_test['AD']

def reduce_multicollinearity(df, threshold=10):
    """
    Removes variables with high VIF one by one until all remaining have VIF < threshold
    """
    while True:
        vif = pd.Series(
            [variance_inflation_factor(df.values, i) for i in range(df.shape[1])],
            index=df.columns
        )
        max_vif = vif.max()
        if max_vif > threshold:
            drop_col = vif.idxmax()
            print(f"Dropping '{drop_col}' due to high VIF: {max_vif:.2f}")
            df = df.drop(columns=[drop_col])
        else:
            break
    return df

# Step 1: Remove constant columns
X_train_mmse = X_train_mmse.loc[:, X_train_mmse.nunique() > 1]
X_test_mmse = X_test_mmse[X_train_mmse.columns]

# Step 2: Remove perfect correlations
corr_matrix = X_train_mmse.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] == 1.0)]
X_train_mmse = X_train_mmse.drop(columns=to_drop)
X_test_mmse = X_test_mmse[X_train_mmse.columns]

# Step 3: Reduce multicollinearity using VIF
X_train_mmse = reduce_multicollinearity(X_train_mmse, threshold=50)
X_test_mmse = X_test_mmse[X_train_mmse.columns]
# Run bootstrapped logistic regression on MMSE questions
results_mmse = bootstrap_metrics(
    df_train=X_train_mmse,
    df_test=X_test_mmse,
    independent_vars=X_train_mmse.columns.tolist(),
    dep_var_train=y_train_mmse,
    dep_var_test=y_test_mmse,
    n_bootstrap=1000,
    threshold=0.5
)

# Show odds ratios
print("\n===== Odds Ratios (one-time fit) — MMSE Questions =====")
print(results_mmse["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) — MMSE Questions =====")
for metric, (mean, ci) in results_mmse["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

Dropping 'MMSTATE' due to high VIF: 602.54
Dropping 'MMWATCH' due to high VIF: 386.89
Dropping 'MMPENCIL' due to high VIF: 322.79
Dropping 'WORD3' due to high VIF: 299.38
Dropping 'MMD' due to high VIF: 263.78
Dropping 'MMREAD' due to high VIF: 206.67
Dropping 'MMFOLD' due to high VIF: 184.74
Dropping 'WORD2' due to high VIF: 167.67
Dropping 'MMO' due to high VIF: 126.43
Dropping 'MMCITY' due to high VIF: 95.22
Dropping 'WORD1' due to high VIF: 75.93
Dropping 'MMONFLR' due to high VIF: 63.52
Dropping 'MMW' due to high VIF: 59.63

===== Odds Ratios (one-time fit) — MMSE Questions =====
           Odds Ratio  95% CI Lower  95% CI Upper       p-value
const     2874.301656    147.076360  56172.249455  1.515220e-07
MMYEAR       0.545725      0.143876      2.069948  3.732575e-01
MMMONTH      0.359933      0.114730      1.129192  7.982908e-02
MMDAY        0.695798      0.286115      1.692101  4.237520e-01
MMSEASON     0.441816      0.232747      0.838686  1.249263e-02
MMDATE       0.244242   