# Alzheimer's Prediction using MMSE Features
This notebook runs logistic regression with bootstrapping on MMSCORE and MMSE questions to predict Alzheimer's Disease (AD).

In [103]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from Logistic_bootstrap_metrics import bootstrap_metrics

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score


## Load and Prepare Data

In [104]:
# Load dataset
df = pd.read_csv("data/TrainTest_Table.csv")

# Split into train and test based on 'Split' column
df_train = df[df['Split'] == 'Train']
df_test = df[df['Split'] == 'Test']

# Define MMSE question columns
mmse_cols = [
    # A. Orientation
    "MMYEAR", "MMMONTH", "MMDAY", "MMSEASON", "MMDATE",
    "MMSTATE", "MMCITY", "MMAREA", "MMHOSPIT", "MMFLOOR",

    # B. Registration
    "WORD1", "WORD2", "WORD3",

    # C. Attention and Calculation – Serial 7s only
    "MMD", "MML", "MMR", "MMO", "MMW",

    # D. Recall
    "WORD1DL", "WORD2DL", "WORD3DL",

    # E. Language
    "MMWATCH", "MMPENCIL", "MMREPEAT",
    "MMHAND", "MMFOLD", "MMONFLR",
    "MMREAD", "MMWRITE", "MMDRAW"
]

# Load the saved dataframes with embeddings
df_prompts_train = pd.read_csv("data/MMSE_Prompts_Train.csv")
df_prompts_test = pd.read_csv("data/MMSE_Prompts_Test.csv")


df_context_promts_train = pd.read_csv("data/MMSE_Context_Prompts_Train.csv")
df_context_promts_test = pd.read_csv("data/MMSE_Context_Prompts_Test.csv")


## Logistic Regression on MMSCORE with Bootstrapping

In [105]:
# Define features and target
X_train = df_train[['MMSCORE']]
y_train = df_train['AD']
X_test = df_test[['MMSCORE']]
y_test = df_test['AD']

# Run bootstrapped logistic regression
results = bootstrap_metrics(
    df_train=X_train,
    df_test=X_test,
    independent_vars=['MMSCORE'],
    dep_var_train=y_train,
    dep_var_test=y_test,
    n_bootstrap=1000,
    threshold=0.5
)

# Display results
print("\n===== Odds Ratios (one-time fit) — MMSE Score =====")
print(results["Odds Ratios (one-time fit)"])

print("\n===== Bootstrapped Metrics (1000 runs) - MMSE Score =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


===== Odds Ratios (one-time fit) — MMSE Score =====
           Odds Ratio   95% CI Lower  95% CI Upper       p-value
const    1.108315e+07  584766.636652  2.100603e+08  3.205904e-27
MMSCORE  5.208992e-01       0.467053  5.809535e-01  1.066395e-31

===== Bootstrapped Metrics (1000 runs) - MMSE Score =====
AUC: Mean = 0.7946, 95% CI = [0.7893, 0.8066]
Balanced Accuracy: Mean = 0.6498, 95% CI = [0.6437, 0.6546]
Sensitivity: Mean = 0.3209, 95% CI = [0.3091, 0.3273]
Specificity: Mean = 0.9787, 95% CI = [0.9783, 0.9819]
F1-score: Mean = 0.4493, 95% CI = [0.4359, 0.4615]


## Logistic Regression on MMSE Questions with Bootstrapping

In [106]:
def generate_mmseq_ad_pairs(df, mmse_cols):
    """
    Generates a DataFrame with MMSE question and corresponding AD labels of the same row.
    Each MMSE question is treated as an independent feature.
    Args:
        df (pd.DataFrame): DataFrame containing MMSE questions and AD labels.
        mmse_cols (list): List of MMSE question columns.
    Returns:
        pd.DataFrame: DataFrame with MMSE question results and corresponding AD labels.
    """
    mmse_ad_pairs = []
    for col in mmse_cols:
        for _, row in df.iterrows():
            if pd.notnull(row[col]):
                mmse_ad_pairs.append({
                    'Result': row[col],
                    'AD': row['AD'],
                })
    return pd.DataFrame(mmse_ad_pairs)
# Generate MMSE-AD pairs for training and testing sets
df_train_mmse = generate_mmseq_ad_pairs(df_train, mmse_cols)
df_test_mmse = generate_mmseq_ad_pairs(df_test, mmse_cols)

# Define features and target
X_train_mmse = df_train_mmse[['Result']]
y_train_mmse = df_train_mmse['AD']
X_test_mmse = df_test_mmse[['Result']]
y_test_mmse = df_test_mmse['AD']

# Run bootstrapped logistic regression on MMSE questions
results_mmse = bootstrap_metrics(
    df_train=X_train_mmse,
    df_test=X_test_mmse,
    independent_vars=['Result'],
    dep_var_train=y_train_mmse,
    dep_var_test=y_test_mmse,
    n_bootstrap=1000,
    threshold=0.5
)

# Show odds ratios
print("\n===== Odds Ratios (one-time fit) — MMSE Questions =====")
print(results_mmse["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) — MMSE Questions =====")
for metric, (mean, ci) in results_mmse["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


===== Odds Ratios (one-time fit) — MMSE Questions =====
        Odds Ratio  95% CI Lower  95% CI Upper        p-value
const     0.728889      0.659118      0.806046   7.276976e-10
Result    0.246591      0.221570      0.274438  4.657922e-145

===== Bootstrapped Metrics (1000 runs) — MMSE Questions =====
AUC: Mean = 0.5475, 95% CI = [0.5473, 0.5476]
Balanced Accuracy: Mean = 0.5000, 95% CI = [0.5000, 0.5000]
Sensitivity: Mean = 0.0000, 95% CI = [0.0000, 0.0000]
Specificity: Mean = 1.0000, 95% CI = [1.0000, 1.0000]
F1-score: Mean = 0.0000, 95% CI = [0.0000, 0.0000]


## Logistic Regression on Embeddings with Bootstrapping

In [107]:
# Define features and target
embedding_cols = [col for col in df_prompts_train.columns if col.startswith("Embedding_")]
X_train_embeddings = df_prompts_train[embedding_cols].values
X_test_embeddings = df_prompts_test[embedding_cols].values
y_train_embeddings = df_prompts_train['AD'].values
y_test_embeddings = df_prompts_test['AD'].values

# Run bootstrapped logistic regression on embeddings
results = bootstrap_metrics(
    df_train=pd.DataFrame(X_train_embeddings, columns=embedding_cols),
    df_test=pd.DataFrame(X_test_embeddings, columns=embedding_cols),
    independent_vars=embedding_cols,  # Use embedding column names
    dep_var_train=pd.Series(y_train_embeddings),
    dep_var_test=pd.Series(y_test_embeddings),
    n_bootstrap=1000,  # Number of bootstrap iterations
    threshold=0.5      # Classification threshold
)

# Show odds ratios
print("\n===== Odds Ratios (one-time fit) — Embeddings =====")
print(results["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) — Embeddings =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

  result = getattr(ufunc, method)(*inputs, **kwargs)



===== Odds Ratios (one-time fit) — Embeddings =====
               Odds Ratio  95% CI Lower  95% CI Upper   p-value
const            0.960565           NaN           NaN       NaN
Embedding_0      0.322098           NaN           NaN       NaN
Embedding_1      0.403917           NaN           NaN       NaN
Embedding_2      1.275264           0.0           inf  0.999999
Embedding_3      4.317501           0.0           inf  0.999998
...                   ...           ...           ...       ...
Embedding_763    0.034894           NaN           NaN       NaN
Embedding_764    0.267193           0.0           inf  0.999997
Embedding_765   10.559986           NaN           NaN       NaN
Embedding_766    1.046758           NaN           NaN       NaN
Embedding_767   26.726774           0.0           inf  0.999988

[769 rows x 4 columns]

===== Bootstrapped Metrics (1000 runs) — Embeddings =====
AUC: Mean = 0.5747, 95% CI = [0.5744, 0.5751]
Balanced Accuracy: Mean = 0.5144, 95% CI = [0.5141

## Logistic Regression on Contextual Embeddings with Bootstrapping

In [108]:
# Define features and target
embedding_cols = [col for col in df_context_promts_train.columns if col.startswith("Embedding_")]
X_train_embeddings = df_prompts_train[embedding_cols].values
X_test_embeddings = df_prompts_test[embedding_cols].values
y_train_embeddings = df_prompts_train['AD'].values
y_test_embeddings = df_prompts_test['AD'].values

# Run bootstrapped logistic regression on embeddings
results = bootstrap_metrics(
    df_train=pd.DataFrame(X_train_embeddings, columns=embedding_cols),
    df_test=pd.DataFrame(X_test_embeddings, columns=embedding_cols),
    independent_vars=embedding_cols,  # Use embedding column names
    dep_var_train=pd.Series(y_train_embeddings),
    dep_var_test=pd.Series(y_test_embeddings),
    n_bootstrap=1000,  # Number of bootstrap iterations
    threshold=0.5      # Classification threshold
)

# Show odds ratios
print("\n===== Odds Ratios (one-time fit) — Contextual Embeddings =====")
print(results["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) — Contextual Embeddings =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


===== Odds Ratios (one-time fit) — Contextual Embeddings =====
       Odds Ratio  95% CI Lower  95% CI Upper  p-value
const    0.204511      0.197739      0.211516      0.0

===== Bootstrapped Metrics (1000 runs) — Contextual Embeddings =====
AUC: Mean = 0.5000, 95% CI = [0.5000, 0.5000]
Balanced Accuracy: Mean = 0.5000, 95% CI = [0.5000, 0.5000]
Sensitivity: Mean = 0.0000, 95% CI = [0.0000, 0.0000]
Specificity: Mean = 1.0000, 95% CI = [1.0000, 1.0000]
F1-score: Mean = 0.0000, 95% CI = [0.0000, 0.0000]
