# Alzheimer's Prediction using MMSE Features
This notebook runs logistic regression with bootstrapping on MMSCORE and MMSE questions to predict Alzheimer's Disease (AD).

In [153]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from Logistic_bootstrap_metrics import bootstrap_metrics

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score


## Load and Prepare Data

In [154]:
# Load dataset
df = pd.read_csv("data/TrainTest_Table.csv")

# Split into train and test based on 'Split' column
df_train = df[df['Split'] == 'Train']
df_test = df[df['Split'] == 'Test']

# Define MMSE question columns
mmse_cols = [
    # A. Orientation
    "MMYEAR", "MMMONTH", "MMDAY", "MMSEASON", "MMDATE",
    "MMSTATE", "MMCITY", "MMAREA", "MMHOSPIT", "MMFLOOR",

    # B. Registration
    "WORD1", "WORD2", "WORD3",

    # C. Attention and Calculation – Serial 7s only
    "MMD", "MML", "MMR", "MMO", "MMW",

    # D. Recall
    "WORD1DL", "WORD2DL", "WORD3DL",

    # E. Language
    "MMWATCH", "MMPENCIL", "MMREPEAT",
    "MMHAND", "MMFOLD", "MMONFLR",
    "MMREAD", "MMWRITE", "MMDRAW"
]

# Load the saved dataframes with embeddings
df_prompts_train = pd.read_csv("data/MMSE_Prompts_Train.csv")
df_prompts_test = pd.read_csv("data/MMSE_Prompts_Test.csv")


df_context_prompts_train = pd.read_csv("data/MMSE_Context_Prompts_Train.csv")
df_context_prompts_test = pd.read_csv("data/MMSE_Context_Prompts_Test.csv")


## Logistic Regression on MMSCORE with Bootstrapping

In [155]:
# Define features and target
X_train = df_train[['MMSCORE']]
y_train = df_train['AD']
X_test = df_test[['MMSCORE']]
y_test = df_test['AD']

# Run bootstrapped logistic regression
results = bootstrap_metrics(
    df_train=X_train,
    df_test=X_test,
    independent_vars=['MMSCORE'],
    dep_var_train=y_train,
    dep_var_test=y_test,
    n_bootstrap=1000,
    threshold=0.5
)

# Display results
print("\n===== Odds Ratios (one-time fit) — MMSE Score =====")
print(results["Odds Ratios (one-time fit)"])

print("\n===== Bootstrapped Metrics (1000 runs) - MMSE Score =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


===== Odds Ratios (one-time fit) — MMSE Score =====
           Odds Ratio   95% CI Lower  95% CI Upper       p-value
const    1.108315e+07  584766.636652  2.100603e+08  3.205904e-27
MMSCORE  5.208992e-01       0.467053  5.809535e-01  1.066395e-31

===== Bootstrapped Metrics (1000 runs) - MMSE Score =====
AUC: Mean = 0.7946, 95% CI = [0.7893, 0.8066]
Balanced Accuracy: Mean = 0.6498, 95% CI = [0.6437, 0.6546]
Sensitivity: Mean = 0.3209, 95% CI = [0.3091, 0.3273]
Specificity: Mean = 0.9787, 95% CI = [0.9783, 0.9819]
F1-score: Mean = 0.4493, 95% CI = [0.4359, 0.4615]


## Logistic Regression on MMSE Questions with Bootstrapping

In [158]:
def generate_mmseq_ad_pairs(df, mmse_cols):
    """
    Generates a DataFrame with MMSE question names, results, and corresponding AD labels.
    Each MMSE question is treated as an independent feature.
    Args:
        df (pd.DataFrame): DataFrame containing MMSE questions and AD labels.
        mmse_cols (list): List of MMSE question columns.
    Returns:
        pd.DataFrame: DataFrame with MMSE question names, results, and corresponding AD labels.
    """
    mmse_ad_pairs = []
    for col in mmse_cols:
        for _, row in df.iterrows():
            if pd.notnull(row[col]):
                mmse_ad_pairs.append({
                    'Question': col,  # Add the question name
                    'Result': row[col],  # Add the result (0 or 1)
                    'AD': row['AD'],  # Add the AD label
                })
    return pd.DataFrame(mmse_ad_pairs)

# Analyze each MMSE question separately
question_results = []

for question in mmse_cols:
    print(f"\nRunning logistic regression for question: {question}")
    
    # Define features and target for the current question
    X_train_question = df_train[[question]].dropna()  # Feature: Result of the current question
    y_train_question = df_train.loc[X_train_question.index, 'AD']  # Target: AD label for valid rows
    X_test_question = df_test[[question]].dropna()  # Feature: Result of the current question
    y_test_question = df_test.loc[X_test_question.index, 'AD']  # Target: AD label for valid rows
    
    # Check for variability in the data
    if X_train_question[question].nunique() < 2:
        print(f"Skipping question {question} due to lack of variability.")
        continue
    
    # Check for sufficient data
    if len(X_train_question) < 10:  # Adjust the threshold as needed
        print(f"Skipping question {question} due to insufficient data.")
        continue
    
    # Add a constant (intercept) column to the feature matrices
    X_train_question = sm.add_constant(X_train_question)
    X_test_question = sm.add_constant(X_test_question)
    
    # Run logistic regression for the current question
    try:
        results_question = bootstrap_metrics(
            df_train=X_train_question,
            df_test=X_test_question,
            independent_vars=['const', question],  # Include the constant and the question as independent variables
            dep_var_train=y_train_question,  # AD label for training
            dep_var_test=y_test_question,  # AD label for testing
            n_bootstrap=1000,  # Number of bootstrap iterations
            threshold=0.5  # Classification threshold
        )
        
        # Store results
        question_results.append({
            'Question': question,
            'Odds Ratios': results_question["Odds Ratios (one-time fit)"],
            'Bootstrapped Metrics': results_question["Bootstrapped Metrics"]
        })
    except Exception as e:
        print(f"Error running logistic regression for question {question}: {e}")
        continue

# Display results for each question
for result in question_results:
    print(f"\n===== Results for {result['Question']} =====")
    print("Odds Ratios:")
    print(result['Odds Ratios'])
    print("Bootstrapped Metrics:")
    for metric, (mean, ci) in result['Bootstrapped Metrics'].items():
        print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


Running logistic regression for question: MMYEAR

Running logistic regression for question: MMMONTH

Running logistic regression for question: MMDAY

Running logistic regression for question: MMSEASON

Running logistic regression for question: MMDATE

Running logistic regression for question: MMSTATE


  result = getattr(ufunc, method)(*inputs, **kwargs)



Running logistic regression for question: MMCITY

Running logistic regression for question: MMAREA

Running logistic regression for question: MMHOSPIT

Running logistic regression for question: MMFLOOR

Running logistic regression for question: WORD1
Error running logistic regression for question WORD1: Singular matrix

Running logistic regression for question: WORD2


  result = getattr(ufunc, method)(*inputs, **kwargs)



Running logistic regression for question: WORD3
Error running logistic regression for question WORD3: "['const'] not in index"

Running logistic regression for question: MMD
Error running logistic regression for question MMD: "['const'] not in index"

Running logistic regression for question: MML


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)



Running logistic regression for question: MMR

Running logistic regression for question: MMO

Running logistic regression for question: MMW

Running logistic regression for question: WORD1DL

Running logistic regression for question: WORD2DL

Running logistic regression for question: WORD3DL

Running logistic regression for question: MMWATCH
Error running logistic regression for question MMWATCH: "['const'] not in index"

Running logistic regression for question: MMPENCIL
Error running logistic regression for question MMPENCIL: "['const'] not in index"

Running logistic regression for question: MMREPEAT


  result = getattr(ufunc, method)(*inputs, **kwargs)



Running logistic regression for question: MMHAND

Running logistic regression for question: MMFOLD

Running logistic regression for question: MMONFLR

Running logistic regression for question: MMREAD

Running logistic regression for question: MMWRITE

Running logistic regression for question: MMDRAW

===== Results for MMYEAR =====
Odds Ratios:
        Odds Ratio  95% CI Lower  95% CI Upper       p-value
const     3.600000      1.336592      9.696303  1.128111e-02
MMYEAR    0.049663      0.018088      0.136354  5.658192e-09
Bootstrapped Metrics:
AUC: Mean = 0.5125, 95% CI = [0.5037, 0.5146]
Balanced Accuracy: Mean = 0.5125, 95% CI = [0.5037, 0.5146]
Sensitivity: Mean = 0.0357, 95% CI = [0.0182, 0.0364]
Specificity: Mean = 0.9893, 95% CI = [0.9892, 0.9928]
F1-score: Mean = 0.0655, 95% CI = [0.0339, 0.0678]

===== Results for MMMONTH =====
Odds Ratios:
         Odds Ratio  95% CI Lower  95% CI Upper       p-value
const      3.000000      1.275341      7.056934  1.182792e-02
MMMONTH    0.

## Logistic Regression on Embeddings with Bootstrapping

In [138]:
# Define features and target
embedding_cols = [col for col in df_prompts_train.columns if col.startswith("Embedding_")]
X_train_embeddings = df_prompts_train[embedding_cols].values
X_test_embeddings = df_prompts_test[embedding_cols].values
y_train_embeddings = df_prompts_train['AD'].values
y_test_embeddings = df_prompts_test['AD'].values

# Run bootstrapped logistic regression on embeddings
results = bootstrap_metrics(
    df_train=pd.DataFrame(X_train_embeddings, columns=embedding_cols),
    df_test=pd.DataFrame(X_test_embeddings, columns=embedding_cols),
    independent_vars=embedding_cols,  # Use embedding column names
    dep_var_train=pd.Series(y_train_embeddings),
    dep_var_test=pd.Series(y_test_embeddings),
    n_bootstrap=1000,  # Number of bootstrap iterations
    threshold=0.5      # Classification threshold
)

# Show odds ratios
print("\n===== Odds Ratios (one-time fit) — Embeddings =====")
print(results["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) — Embeddings =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  result = getattr(ufunc, method)(*inputs, **kwargs)
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np


===== Odds Ratios (one-time fit) — Embeddings =====
               Odds Ratio  95% CI Lower  95% CI Upper        p-value
const                 0.0           NaN           NaN            NaN
Embedding_0           0.0           0.0           0.0  1.199686e-220
Embedding_1           0.0           NaN           NaN            NaN
Embedding_2           0.0           NaN           NaN            NaN
Embedding_3           inf           NaN           NaN            NaN
...                   ...           ...           ...            ...
Embedding_763         inf           NaN           NaN            NaN
Embedding_764         0.0           0.0           0.0  1.798735e-274
Embedding_765         inf           NaN           NaN            NaN
Embedding_766         0.0           0.0           0.0   0.000000e+00
Embedding_767         0.0           0.0           0.0   0.000000e+00

[769 rows x 4 columns]

===== Bootstrapped Metrics (1000 runs) — Embeddings =====
AUC: Mean = 0.5652, 95% CI = [0.5649

  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


## Logistic Regression on Contextual Embeddings with Bootstrapping

In [None]:
# Define features and target
context_embedding_cols = [col for col in df_context_prompts_train.columns if col.startswith("Context_Embedding_")]
X_train_embeddings = df_context_prompts_train[context_embedding_cols].values
X_test_embeddings = df_context_prompts_test[context_embedding_cols].values
y_train_embeddings = df_context_prompts_train['AD'].values
y_test_embeddings = df_context_prompts_test['AD'].values

# Run bootstrapped logistic regression on embeddings
results = bootstrap_metrics(
    df_train=pd.DataFrame(X_train_embeddings, columns=context_embedding_cols),
    df_test=pd.DataFrame(X_test_embeddings, columns=context_embedding_cols),
    independent_vars=context_embedding_cols,  # Use embedding column names
    dep_var_train=pd.Series(y_train_embeddings),
    dep_var_test=pd.Series(y_test_embeddings),
    n_bootstrap=1000,  # Number of bootstrap iterations
    threshold=0.5      # Classification threshold
)

# Show odds ratios
print("\n===== Odds Ratios (one-time fit) — Contextual Embeddings =====")
print(results["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) — Contextual Embeddings =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  result = getattr(ufunc, method)(*inputs, **kwargs)



===== Odds Ratios (one-time fit) — Contextual Embeddings =====
                       Odds Ratio  95% CI Lower  95% CI Upper       p-value
const                         inf           NaN           NaN           NaN
Context_Embedding_0           0.0           NaN           NaN           NaN
Context_Embedding_1           inf           NaN           NaN           NaN
Context_Embedding_2           inf           inf           inf  3.679157e-02
Context_Embedding_3           inf           NaN           NaN           NaN
...                           ...           ...           ...           ...
Context_Embedding_763         inf           NaN           NaN           NaN
Context_Embedding_764         0.0           0.0           0.0  6.958955e-05
Context_Embedding_765         inf           0.0           inf  6.524462e-01
Context_Embedding_766         0.0           0.0           inf  5.201351e-01
Context_Embedding_767         0.0           0.0           0.0  9.503478e-11

[769 rows x 4 columns]
