# Alzheimer's Prediction using MMSE Features
This notebook runs logistic regression with bootstrapping on MMSCORE and MMSE questions to predict Alzheimer's Disease (AD).

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from Logistic_bootstrap_metrics import bootstrap_metrics

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


## Load and Prepare Data

In [166]:
# Load dataset
df = pd.read_csv("data/TrainTest_Table.csv")

# Split into train and test based on 'Split' column
df_train = df[df['Split'] == 'Train']
df_test = df[df['Split'] == 'Test']

# Define MMSE question columns
mmse_cols = [
    # A. Orientation
    "MMYEAR", "MMMONTH", "MMDAY", "MMSEASON", "MMDATE",
    "MMSTATE", "MMCITY", "MMAREA", "MMHOSPIT", "MMFLOOR",

    # B. Registration
    "WORD1", "WORD2", "WORD3",

    # C. Attention and Calculation – Serial 7s only
    "MMD", "MML", "MMR", "MMO", "MMW",

    # D. Recall
    "WORD1DL", "WORD2DL", "WORD3DL",

    # E. Language
    "MMWATCH", "MMPENCIL", "MMREPEAT",
    "MMHAND", "MMFOLD", "MMONFLR",
    "MMREAD", "MMWRITE", "MMDRAW"
]

# Define MMSE question categories
categories = {
    "Orientation_Time": ["MMYEAR", "MMMONTH", "MMDAY", "MMSEASON", "MMDATE"],
    "Orientation_Place": ["MMSTATE", "MMCITY", "MMAREA", "MMHOSPIT", "MMFLOOR"],
    "Registration": ["WORD1", "WORD2", "WORD3"],
    "Attention_Calculation": ["MMD", "MML", "MMR", "MMO", "MMW", "MMLTR1", "MMLTR2", "MMLTR3", "MMLTR4", "MMLTR5", "MMLTR6", "MMLTR7"],
    "Recall": ["WORD1DL", "WORD2DL", "WORD3DL"],
    "Language": ["MMWATCH", "MMPENCIL", "MMREPEAT", "MMHAND", "MMFOLD", "MMONFLR", "MMREAD", "MMWRITE", "MMDRAW"]
}

# Load the saved dataframes with embeddings
df_prompts_train = pd.read_csv("data/MMSE_Prompts_Train.csv")
df_prompts_test = pd.read_csv("data/MMSE_Prompts_Test.csv")


df_context_prompts_train = pd.read_csv("data/MMSE_Context_Prompts_Train.csv")
df_context_prompts_test = pd.read_csv("data/MMSE_Context_Prompts_Test.csv")


## Logistic Regression on MMSCORE with Bootstrapping

In [167]:
# Define features and target
X_train = df_train[['MMSCORE']]
y_train = df_train['AD']
X_test = df_test[['MMSCORE']]
y_test = df_test['AD']

# Run bootstrapped logistic regression
results = bootstrap_metrics(
    df_train=X_train,
    df_test=X_test,
    independent_vars=['MMSCORE'],
    dep_var_train=y_train,
    dep_var_test=y_test,
    n_bootstrap=1000,
    threshold=0.5
)

# Display results
print("\n===== Odds Ratios (one-time fit) — MMSE Score =====")
print(results["Odds Ratios (one-time fit)"])

print("\n===== Bootstrapped Metrics (1000 runs) - MMSE Score =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


===== Odds Ratios (one-time fit) — MMSE Score =====
           Odds Ratio   95% CI Lower  95% CI Upper       p-value
const    1.108315e+07  584766.636652  2.100603e+08  3.205904e-27
MMSCORE  5.208992e-01       0.467053  5.809535e-01  1.066395e-31

===== Bootstrapped Metrics (1000 runs) - MMSE Score =====
AUC: Mean = 0.7946, 95% CI = [0.7893, 0.8066]
Balanced Accuracy: Mean = 0.6498, 95% CI = [0.6437, 0.6546]
Sensitivity: Mean = 0.3209, 95% CI = [0.3091, 0.3273]
Specificity: Mean = 0.9787, 95% CI = [0.9783, 0.9819]
F1-score: Mean = 0.4493, 95% CI = [0.4359, 0.4615]


## Logistic Regression on MMSE Questions with Bootstrapping

In [171]:
def generate_mmseq_ad_pairs(df, mmse_cols):
    """
    Generates a DataFrame with MMSE question and corresponding AD labels of the same row.
    Each MMSE question is treated as an independent feature.
    Args:
        df (pd.DataFrame): DataFrame containing MMSE questions and AD labels.
        mmse_cols (list): List of MMSE question columns.
    Returns:
        pd.DataFrame: DataFrame with MMSE question results and corresponding AD labels.
    """
    mmse_ad_pairs = []
    for col in mmse_cols:
        for _, row in df.iterrows():
            if pd.notnull(row[col]):
                mmse_ad_pairs.append({
                    'Result': row[col],
                    'AD': row['AD'],
                })
    return pd.DataFrame(mmse_ad_pairs)

# Create composite scores for each category
for category, questions in categories.items():
    df_train[f"{category}_COMPOSITE"] = df_train[questions].sum(axis=1, skipna=True)
    df_test[f"{category}_COMPOSITE"] = df_test[questions].sum(axis=1, skipna=True)

# Analyze each category
category_results = []

for category in categories.keys():
    # print(f"\nRunning logistic regression for category: {category}")
    
    # Define features and target
    X_train_category = sm.add_constant(df_train[[f"{category}_COMPOSITE"]])
    y_train_category = df_train['AD']
    X_test_category = sm.add_constant(df_test[[f"{category}_COMPOSITE"]])
    y_test_category = df_test['AD']
    
    # Handle class imbalance with SMOTE
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_category, y_train_category)
    
    # Run logistic regression
    try:
        results_category = bootstrap_metrics(
            df_train=X_train_balanced,
            df_test=X_test_category,
            independent_vars=['const', f"{category}_COMPOSITE"],
            dep_var_train=y_train_balanced,
            dep_var_test=y_test_category,
            n_bootstrap=1000,
            threshold=0.5
        )
        
        # Store results
        category_results.append({
            'Category': category,
            'Odds Ratios': results_category["Odds Ratios (one-time fit)"],
            'Bootstrapped Metrics': results_category["Bootstrapped Metrics"]
        })
    except Exception as e:
        print(f"Error running logistic regression for category {category}: {e}")
        continue

# Display results for each category
for result in category_results:
    print(f"\n===== Results for {result['Category']} =====")
    print("Odds Ratios:")
    print(result['Odds Ratios'])
    print("Bootstrapped Metrics:")
    for metric, (mean, ci) in result['Bootstrapped Metrics'].items():
        print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

# Combine all categories into a single model
X_train_combined = sm.add_constant(df_train[[f"{category}_COMPOSITE" for category in categories.keys()]])
X_test_combined = sm.add_constant(df_test[[f"{category}_COMPOSITE" for category in categories.keys()]])
y_train_combined = df_train['AD']
y_test_combined = df_test['AD']

# Run logistic regression on the combined model
results_combined = bootstrap_metrics(
    df_train=X_train_combined,
    df_test=X_test_combined,
    independent_vars=['const'] + [f"{category}_COMPOSITE" for category in categories.keys()],
    dep_var_train=y_train_combined,
    dep_var_test=y_test_combined,
    n_bootstrap=1000,
    threshold=0.5
)

# Display combined model results
print("\n===== Results for Combined Model =====")
print("Odds Ratios:")
print(results_combined["Odds Ratios (one-time fit)"])
print("Bootstrapped Metrics:")
for metric, (mean, ci) in results_combined["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

# Train a random forest classifier on the combined model
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train_combined, y_train_combined)

# Predict probabilities and evaluate
y_pred_probs = rf.predict_proba(X_test_combined)[:, 1]
auc = roc_auc_score(y_test_combined, y_pred_probs)
print(f"\nRandom Forest AUC: {auc:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[f"{category}_COMPOSITE"] = df_train[questions].sum(axis=1, skipna=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[f"{category}_COMPOSITE"] = df_test[questions].sum(axis=1, skipna=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[f"{category}_COMPOSITE"] = df_train[


===== Results for Orientation_Time =====
Odds Ratios:
                            Odds Ratio  95% CI Lower  95% CI Upper  \
const                       832.424739    324.769077   2133.611219   
Orientation_Time_COMPOSITE    0.219171      0.178930      0.268464   

                                 p-value  
const                       1.506653e-44  
Orientation_Time_COMPOSITE  1.075004e-48  
Bootstrapped Metrics:
AUC: Mean = 0.6507, 95% CI = [0.6432, 0.6575]
Balanced Accuracy: Mean = 0.6343, 95% CI = [0.6281, 0.6408]
Sensitivity: Mean = 0.4822, 95% CI = [0.4727, 0.4909]
Specificity: Mean = 0.7865, 95% CI = [0.7834, 0.7942]
F1-score: Mean = 0.3770, 95% CI = [0.3688, 0.3857]

===== Results for Orientation_Place =====
Odds Ratios:
                              Odds Ratio  95% CI Lower  95% CI Upper  \
const                        1669.146820    478.250180   5825.509789   
Orientation_Place_COMPOSITE     0.206352      0.159132      0.267584   

                                  p-value  
c

  result = getattr(ufunc, method)(*inputs, **kwargs)



===== Results for Combined Model =====
Odds Ratios:
                                   Odds Ratio  95% CI Lower  95% CI Upper  \
const                            2.729334e-22      0.000000           inf   
Orientation_Time_COMPOSITE       3.431941e-01      0.253865      0.463956   
Orientation_Place_COMPOSITE      4.629046e-01      0.294162      0.728444   
Registration_COMPOSITE           1.475229e+09      0.000000           inf   
Attention_Calculation_COMPOSITE  6.587537e-01      0.488677      0.888023   
Recall_COMPOSITE                 5.176974e-01      0.424054      0.632020   
Language_COMPOSITE               6.543024e-01      0.451229      0.948769   

                                      p-value  
const                            9.989788e-01  
Orientation_Time_COMPOSITE       3.591096e-12  
Orientation_Place_COMPOSITE      8.695491e-04  
Registration_COMPOSITE           9.986974e-01  
Attention_Calculation_COMPOSITE  6.156076e-03  
Recall_COMPOSITE                 9.994602e

## Logistic Regression on Embeddings with Bootstrapping

In [169]:
# Define features and target
embedding_cols = [col for col in df_prompts_train.columns if col.startswith("Embedding_")]
X_train_embeddings = df_prompts_train[embedding_cols].values
X_test_embeddings = df_prompts_test[embedding_cols].values
y_train_embeddings = df_prompts_train['AD'].values
y_test_embeddings = df_prompts_test['AD'].values

# Run bootstrapped logistic regression on embeddings
results = bootstrap_metrics(
    df_train=pd.DataFrame(X_train_embeddings, columns=embedding_cols),
    df_test=pd.DataFrame(X_test_embeddings, columns=embedding_cols),
    independent_vars=embedding_cols,  # Use embedding column names
    dep_var_train=pd.Series(y_train_embeddings),
    dep_var_test=pd.Series(y_test_embeddings),
    n_bootstrap=1000,  # Number of bootstrap iterations
    threshold=0.5      # Classification threshold
)

# Show odds ratios
print("\n===== Odds Ratios (one-time fit) — Embeddings =====")
print(results["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) — Embeddings =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  result = getattr(ufunc, method)(*inputs, **kwargs)
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np


===== Odds Ratios (one-time fit) — Embeddings =====
               Odds Ratio  95% CI Lower  95% CI Upper        p-value
const                 0.0           NaN           NaN            NaN
Embedding_0           0.0           0.0           0.0  1.199686e-220
Embedding_1           0.0           NaN           NaN            NaN
Embedding_2           0.0           NaN           NaN            NaN
Embedding_3           inf           NaN           NaN            NaN
...                   ...           ...           ...            ...
Embedding_763         inf           NaN           NaN            NaN
Embedding_764         0.0           0.0           0.0  1.798735e-274
Embedding_765         inf           NaN           NaN            NaN
Embedding_766         0.0           0.0           0.0   0.000000e+00
Embedding_767         0.0           0.0           0.0   0.000000e+00

[769 rows x 4 columns]

===== Bootstrapped Metrics (1000 runs) — Embeddings =====
AUC: Mean = 0.5652, 95% CI = [0.5649

  return 1/(1+np.exp(-X))


## Logistic Regression on Contextual Embeddings with Bootstrapping

In [170]:
# Define features and target
context_embedding_cols = [col for col in df_context_prompts_train.columns if col.startswith("Context_Embedding_")]
X_train_embeddings = df_context_prompts_train[context_embedding_cols].values
X_test_embeddings = df_context_prompts_test[context_embedding_cols].values
y_train_embeddings = df_context_prompts_train['AD'].values
y_test_embeddings = df_context_prompts_test['AD'].values

# Run bootstrapped logistic regression on embeddings
results = bootstrap_metrics(
    df_train=pd.DataFrame(X_train_embeddings, columns=context_embedding_cols),
    df_test=pd.DataFrame(X_test_embeddings, columns=context_embedding_cols),
    independent_vars=context_embedding_cols,  # Use embedding column names
    dep_var_train=pd.Series(y_train_embeddings),
    dep_var_test=pd.Series(y_test_embeddings),
    n_bootstrap=1000,  # Number of bootstrap iterations
    threshold=0.5      # Classification threshold
)

# Show odds ratios
print("\n===== Odds Ratios (one-time fit) — Contextual Embeddings =====")
print(results["Odds Ratios (one-time fit)"])

# Show bootstrapped metrics
print("\n===== Bootstrapped Metrics (1000 runs) — Contextual Embeddings =====")
for metric, (mean, ci) in results["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  result = getattr(ufunc, method)(*inputs, **kwargs)



===== Odds Ratios (one-time fit) — Contextual Embeddings =====
                       Odds Ratio  95% CI Lower  95% CI Upper       p-value
const                         inf           NaN           NaN           NaN
Context_Embedding_0           0.0           NaN           NaN           NaN
Context_Embedding_1           inf           NaN           NaN           NaN
Context_Embedding_2           inf           inf           inf  3.679157e-02
Context_Embedding_3           inf           NaN           NaN           NaN
...                           ...           ...           ...           ...
Context_Embedding_763         inf           NaN           NaN           NaN
Context_Embedding_764         0.0           0.0           0.0  6.958955e-05
Context_Embedding_765         inf           0.0           inf  6.524462e-01
Context_Embedding_766         0.0           0.0           inf  5.201351e-01
Context_Embedding_767         0.0           0.0           0.0  9.503478e-11

[769 rows x 4 columns]
