# Alzheimer's Prediction using MMSE Features
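This notebook runs bootstrapped logistic regression to predict Alzheimer's Disease (AD) from the total MMSE score (MMSCORE), from individual MMSE questions, from per-category composite scores, and from ClinicalBERT embeddings.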

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from Logistic_bootstrap_metrics import bootstrap_metrics

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report


## Load and Prepare Data

In [None]:
# Load the filtered dataset; its 'Split' column assigns each row to Train or Test.
df = pd.read_csv("data/Filtered_Data.csv")

# Split into train and test based on the 'Split' column.
# Explicit copies are taken so that later cells can add columns to these
# frames without triggering pandas' SettingWithCopyWarning.
df_train = df[df['Split'] == 'Train'].copy()
df_test = df[df['Split'] == 'Test'].copy()

# MMSE question columns that are modelled one at a time further down.
mmse_cols = [
    # A. Orientation
    "MMYEAR", "MMMONTH", "MMDAY", "MMSEASON", "MMDATE",
    "MMSTATE", "MMCITY", "MMAREA", "MMHOSPIT", "MMFLOOR",

    # B. Registration
    "WORD1", "WORD2", "WORD3",

    # C. Attention and Calculation – Serial 7s only
    # NOTE(review): these five names end in D, L, R, O, W — possibly the
    # "WORLD backwards" items rather than Serial 7s; confirm against the
    # data dictionary.
    "MMD", "MML", "MMR", "MMO", "MMW",

    # D. Recall
    "WORD1DL", "WORD2DL", "WORD3DL",

    # E. Language
    "MMWATCH", "MMPENCIL", "MMREPEAT",
    "MMHAND", "MMFOLD", "MMONFLR",
    "MMREAD", "MMWRITE", "MMDRAW"
]

# MMSE question categories; each list is summed row-wise into a
# "<category>_COMPOSITE" score later in the notebook.
# NOTE(review): Attention_Calculation also lists MMLTR1–MMLTR7, which are not
# part of mmse_cols — confirm these columns hold numeric item scores before
# they are summed into a composite.
categories = {
    "Orientation_Time": ["MMYEAR", "MMMONTH", "MMDAY", "MMSEASON", "MMDATE"],
    "Orientation_Place": ["MMSTATE", "MMCITY", "MMAREA", "MMHOSPIT", "MMFLOOR"],
    "Registration": ["WORD1", "WORD2", "WORD3"],
    "Attention_Calculation": ["MMD", "MML", "MMR", "MMO", "MMW", "MMLTR1", "MMLTR2", "MMLTR3", "MMLTR4", "MMLTR5", "MMLTR6", "MMLTR7"],
    "Recall": ["WORD1DL", "WORD2DL", "WORD3DL"],
    "Language": ["MMWATCH", "MMPENCIL", "MMREPEAT", "MMHAND", "MMFOLD", "MMONFLR", "MMREAD", "MMWRITE", "MMDRAW"]
}

# ClinicalBERT embeddings
df_clincalbert_train = pd.read_csv("data/Embeddings/ClinicalBERT_Train.csv")
df_clincalbert_test = pd.read_csv("data/Embeddings/ClinicalBERT_Test.csv")

# Contextual ClinicalBERT embeddings.
# The train variable must read the Train file and the test variable the Test
# file; they were previously crossed, which made the downstream model fit on
# the test split and evaluate on the train split.
df_context_clinicalbert_train = pd.read_csv("data/Embeddings/Context_ClinicalBERT_Train.csv")
df_context_clinicalbert_test = pd.read_csv("data/Embeddings/Context_ClinicalBERT_Test.csv")

# MedBERT embeddings (placeholder — nothing is loaded in this cell yet)


## Logistic Regression on MMSCORE with Bootstrapping

In [None]:
# Single-predictor model: the total MMSE score is the only feature and the
# 'AD' column is the label, for both the train and the test split.
X_train, y_train = df_train[['MMSCORE']], df_train['AD']
X_test, y_test = df_test[['MMSCORE']], df_test['AD']

# 'balanced' class weights, then expanded to one weight per training row by
# looking each row's label up in a {class: weight} mapping.
train_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weight_lookup = dict(zip(np.unique(y_train), class_weights))
sample_weights = y_train.map(class_weight_lookup)

# Bootstrapped logistic regression; the per-row weights are handed to
# bootstrap_metrics through its sample_weights argument.
results = bootstrap_metrics(
    df_train=X_train,
    df_test=X_test,
    independent_vars=['MMSCORE'],
    dep_var_train=y_train,
    dep_var_test=y_test,
    n_bootstrap=1000,
    threshold=0.5,
    sample_weights=sample_weights,
)

# Report the odds-ratio table from the one-time fit.
print("\n===== Odds Ratios (one-time fit) — MMSE Score =====")
print(results["Odds Ratios (one-time fit)"])

# Report every bootstrapped metric as "mean, 95% CI".
print("\n===== Bootstrapped Metrics (1000 runs) - MMSE Score =====")
for metric, summary in results["Bootstrapped Metrics"].items():
    mean, ci = summary
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

## Logistic Regression on MMSE Questions with Bootstrapping

In [None]:
# Logistic Regression for Every MMSE Question
# Each question in mmse_cols is fitted as the sole predictor of 'AD'.
# Questions whose fit raises are logged and collected in skipped_questions
# instead of aborting the whole loop.
question_results = []
skipped_questions = []

# The target and its class weights are the same for every question, so they
# are computed once up front rather than on every iteration.
y_train_question = df_train['AD']
y_test_question = df_test['AD']

# Balanced class weights expanded to one weight per training row.
# (This assignment used to be misspelled `lass_weights`, so the next line
# silently picked up `class_weights` from whichever cell had run before.)
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_question), y=y_train_question)
sample_weights = y_train_question.map(dict(zip(np.unique(y_train_question), class_weights)))

for question in mmse_cols:
    try:
        # Design matrices: the question column plus an explicit intercept.
        # has_constant='add' forces the 'const' column to be appended even
        # when the question column itself has no variation.
        X_train_question = sm.add_constant(df_train[[question]], has_constant='add')
        X_test_question = sm.add_constant(df_test[[question]], has_constant='add')

        # Run logistic regression with bootstrapping
        results_question = bootstrap_metrics(
            df_train=X_train_question,
            df_test=X_test_question,
            independent_vars=[question],  # Use only the question column
            dep_var_train=y_train_question,
            dep_var_test=y_test_question,
            n_bootstrap=1000,
            threshold=0.5,
            sample_weights=sample_weights
        )

        # Store results
        question_results.append({
            'Question': question,
            'Odds Ratios': results_question["Odds Ratios (one-time fit)"],
            'Bootstrapped Metrics': results_question["Bootstrapped Metrics"]
        })

    except Exception as e:
        # Deliberate best-effort: record the failure and move on to the next question.
        print(f"Error running logistic regression for question {question}: {e}")
        skipped_questions.append({'Question': question, 'Error': str(e)})

# Display results for each question
for result in question_results:
    print(f"\n===== Odds Ratios (one-time fit) — {result['Question']} =====")
    print(result['Odds Ratios'])

    print(f"\n===== Bootstrapped Metrics (1000 runs) - {result['Question']} =====")
    for metric, (mean, ci) in result['Bootstrapped Metrics'].items():
        print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

# Log skipped questions
if skipped_questions:
    print("\n===== Skipped Questions =====")
    for skipped in skipped_questions:
        print(f"Question: {skipped['Question']}, Error: {skipped['Error']}")

## Logistic Regression on MMSE Categories with Bootstrapping

In [None]:
# Create composite scores for each category: the row-wise sum of the
# category's question columns (missing values are skipped by the sum).
# DataFrame.assign returns a new frame, so the columns are added without
# item-assigning into a filtered frame (which raised SettingWithCopyWarning).
composite_cols = [f"{category}_COMPOSITE" for category in categories.keys()]
df_train = df_train.assign(**{
    f"{category}_COMPOSITE": df_train[questions].sum(axis=1, skipna=True)
    for category, questions in categories.items()
})
df_test = df_test.assign(**{
    f"{category}_COMPOSITE": df_test[questions].sum(axis=1, skipna=True)
    for category, questions in categories.items()
})

# The target and its balanced class weights are identical for every category,
# so they are computed once instead of inside the loop.
y_train_category = df_train['AD']
y_test_category = df_test['AD']
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_category), y=y_train_category)
sample_weights = y_train_category.map(dict(zip(np.unique(y_train_category), class_weights)))

# Analyze each category with its composite score as the only predictor.
category_results = []

for category in categories.keys():
    # Design matrices: composite score plus intercept column.
    X_train_category = sm.add_constant(df_train[[f"{category}_COMPOSITE"]])
    X_test_category = sm.add_constant(df_test[[f"{category}_COMPOSITE"]])

    # Run logistic regression; a failing category is reported and skipped.
    try:
        results_category = bootstrap_metrics(
            df_train=X_train_category,
            df_test=X_test_category,
            independent_vars=[f"{category}_COMPOSITE"],
            dep_var_train=y_train_category,
            dep_var_test=y_test_category,
            n_bootstrap=1000,
            threshold=0.5,
            sample_weights=sample_weights  # Pass class weights
        )

        # Store results
        category_results.append({
            'Category': category,
            'Odds Ratios': results_category["Odds Ratios (one-time fit)"],
            'Bootstrapped Metrics': results_category["Bootstrapped Metrics"]
        })
    except Exception as e:
        print(f"Error running logistic regression for category {category}: {e}")
        continue

# Display results for each category
for result in category_results:
    print(f"\n============ Results for {result['Category']} ============")
    print("----- Odds Ratios (one-time fit) -----")
    print(result['Odds Ratios'])
    print()
    print("----- Bootstrapped Metrics (1000 runs) -----")
    for metric, (mean, ci) in result['Bootstrapped Metrics'].items():
        print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

# Combine all categories into a single model: every composite score enters as
# a predictor alongside the intercept.
X_train_combined = sm.add_constant(df_train[composite_cols])
X_test_combined = sm.add_constant(df_test[composite_cols])
y_train_combined = df_train['AD']
y_test_combined = df_test['AD']

# Compute class weights for the combined model
class_weights_combined = compute_class_weight('balanced', classes=np.unique(y_train_combined), y=y_train_combined)
sample_weights_combined = y_train_combined.map(dict(zip(np.unique(y_train_combined), class_weights_combined)))

# Run logistic regression on the combined model
results_combined = bootstrap_metrics(
    df_train=X_train_combined,
    df_test=X_test_combined,
    independent_vars=composite_cols,
    dep_var_train=y_train_combined,
    dep_var_test=y_test_combined,
    n_bootstrap=1000,
    threshold=0.5,
    sample_weights=sample_weights_combined  # Pass class weights
)

# Display combined model results
print("\n============ Results for Combined Model ============")
print("----- Odds Ratios (one-time fit) -----")
print(results_combined["Odds Ratios (one-time fit)"])
print()
print("----- Bootstrapped Metrics (1000 runs) -----")
for metric, (mean, ci) in results_combined["Bootstrapped Metrics"].items():
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

In [None]:
# Combined model over every category composite except Registration.
# It is fitted twice on the same design matrices: once with per-row class
# weights passed to bootstrap_metrics and once without, so the two sets of
# bootstrapped metrics can be compared.
no_registration_cols = [
    f"{category}_COMPOSITE" for category in categories.keys() if category != "Registration"
]

# Recompute Composite Scores (excluding Registration) so this cell does not
# depend on an earlier cell having created them. DataFrame.assign returns a
# new frame, which avoids the SettingWithCopyWarning that item assignment on
# a filtered frame produced.
df_train = df_train.assign(**{
    f"{category}_COMPOSITE": df_train[questions].sum(axis=1, skipna=True)
    for category, questions in categories.items() if category != "Registration"
})
df_test = df_test.assign(**{
    f"{category}_COMPOSITE": df_test[questions].sum(axis=1, skipna=True)
    for category, questions in categories.items() if category != "Registration"
})

# Design matrices (composites plus intercept) and targets, shared by both runs.
X_train_combined_no_registration = sm.add_constant(df_train[no_registration_cols])
X_test_combined_no_registration = sm.add_constant(df_test[no_registration_cols])
y_train_combined_no_registration = df_train['AD']
y_test_combined_no_registration = df_test['AD']

# Balanced class weights expanded to one weight per training row.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_combined_no_registration), y=y_train_combined_no_registration)
sample_weights = y_train_combined_no_registration.map(dict(zip(np.unique(y_train_combined_no_registration), class_weights)))


def _print_combined_results(title, model_results):
    """Print the odds-ratio table and bootstrapped metrics of one model run.

    title: text placed inside the "============ Results for ... ============" banner.
    model_results: dict returned by bootstrap_metrics; its
        "Odds Ratios (one-time fit)" and "Bootstrapped Metrics" entries are read.
    """
    print(f"\n============ Results for {title} ============")
    print("----- Odds Ratios (one-time fit) -----")
    print(model_results["Odds Ratios (one-time fit)"])
    print()
    print("----- Bootstrapped Metrics (1000 runs) -----")
    for metric, (mean, ci) in model_results["Bootstrapped Metrics"].items():
        print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")


# Run 1: class weights passed. Stored under its own name so that the second
# run below no longer overwrites it.
results_combined_no_registration_weighted = bootstrap_metrics(
    df_train=X_train_combined_no_registration,
    df_test=X_test_combined_no_registration,
    independent_vars=no_registration_cols,
    dep_var_train=y_train_combined_no_registration,
    dep_var_test=y_test_combined_no_registration,
    n_bootstrap=1000,
    threshold=0.5,
    sample_weights=sample_weights  # Pass class weights
)
_print_combined_results(
    "Combined Model (excluding Registration, class-weighted)",
    results_combined_no_registration_weighted,
)

# Run 2: no sample_weights argument. As before, this is the value left in
# `results_combined_no_registration` when the cell finishes.
# NOTE(review): the earlier version of this run computed class weights but
# never passed them. If weights were meant to be passed, this run duplicates
# run 1 and can be deleted.
results_combined_no_registration = bootstrap_metrics(
    df_train=X_train_combined_no_registration,
    df_test=X_test_combined_no_registration,
    independent_vars=no_registration_cols,
    dep_var_train=y_train_combined_no_registration,
    dep_var_test=y_test_combined_no_registration,
    n_bootstrap=1000,
    threshold=0.5
)
_print_combined_results(
    "Combined Model (excluding Registration, unweighted)",
    results_combined_no_registration,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[f"{category}_COMPOSITE"] = df_train[questions].sum(axis=1, skipna=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[f"{category}_COMPOSITE"] = df_test[questions].sum(axis=1, skipna=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[f"{category}_COMPOSITE"] = df_train[


----- Odds Ratios (one-time fit) -----
                                    Odds Ratio  95% CI Lower  95% CI Upper  \
const                            484939.074968   7691.066865  3.057650e+07   
Orientation_Time_COMPOSITE            0.366278      0.273737  4.901033e-01   
Orientation_Place_COMPOSITE           0.461352      0.295355  7.206420e-01   
Attention_Calculation_COMPOSITE       0.647267      0.482081  8.690557e-01   
Recall_COMPOSITE                      0.527801      0.432650  6.438787e-01   
Language_COMPOSITE                    0.679824      0.470374  9.825391e-01   

                                      p-value  
const                            5.940638e-10  
Orientation_Time_COMPOSITE       1.385024e-11  
Orientation_Place_COMPOSITE      6.745266e-04  
Attention_Calculation_COMPOSITE  3.809228e-03  
Recall_COMPOSITE                 2.966219e-10  
Language_COMPOSITE               4.000435e-02  

----- Bootstrapped Metrics (1000 runs) -----
AUC: Mean = 0.7975, 95% CI = [0




----- Odds Ratios (one-time fit) -----
                                    Odds Ratio  95% CI Lower  95% CI Upper  \
const                            484939.074968   7691.066865  3.057650e+07   
Orientation_Time_COMPOSITE            0.366278      0.273737  4.901033e-01   
Orientation_Place_COMPOSITE           0.461352      0.295355  7.206420e-01   
Attention_Calculation_COMPOSITE       0.647267      0.482081  8.690557e-01   
Recall_COMPOSITE                      0.527801      0.432650  6.438787e-01   
Language_COMPOSITE                    0.679824      0.470374  9.825391e-01   

                                      p-value  
const                            5.940638e-10  
Orientation_Time_COMPOSITE       1.385024e-11  
Orientation_Place_COMPOSITE      6.745266e-04  
Attention_Calculation_COMPOSITE  3.809228e-03  
Recall_COMPOSITE                 2.966219e-10  
Language_COMPOSITE               4.000435e-02  

----- Bootstrapped Metrics (1000 runs) -----
AUC: Mean = 0.7975, 95% CI = [0

## Logistic Regression on ClinicalBERT with Bootstrapping

In [None]:
# Feature columns are those whose name starts with "Embedding_"; the label
# is the 'AD' column of the same ClinicalBERT frames.
clinicalbert_embedding_cols = list(
    filter(lambda name: name.startswith("Embedding_"), df_clincalbert_train.columns)
)

# Pull features and labels out as raw arrays for both splits.
X_train_embeddings = df_clincalbert_train[clinicalbert_embedding_cols].values
y_train_embeddings = df_clincalbert_train['AD'].values
X_test_embeddings = df_clincalbert_test[clinicalbert_embedding_cols].values
y_test_embeddings = df_clincalbert_test['AD'].values

# Re-wrap the arrays as fresh pandas objects (default integer index) in the
# form bootstrap_metrics is called with.
train_embedding_frame = pd.DataFrame(X_train_embeddings, columns=clinicalbert_embedding_cols)
test_embedding_frame = pd.DataFrame(X_test_embeddings, columns=clinicalbert_embedding_cols)
train_embedding_labels = pd.Series(y_train_embeddings)
test_embedding_labels = pd.Series(y_test_embeddings)

# Bootstrapped logistic regression over all embedding dimensions:
# 1000 bootstrap iterations, classification threshold 0.5.
results = bootstrap_metrics(
    df_train=train_embedding_frame,
    df_test=test_embedding_frame,
    independent_vars=clinicalbert_embedding_cols,
    dep_var_train=train_embedding_labels,
    dep_var_test=test_embedding_labels,
    n_bootstrap=1000,
    threshold=0.5,
)

# Odds ratios from the one-time fit
print("\n===== Odds Ratios (one-time fit) — Embeddings =====")
print(results["Odds Ratios (one-time fit)"])

# Bootstrapped metrics, one "mean, 95% CI" line per metric
print("\n===== Bootstrapped Metrics (1000 runs) — Embeddings =====")
for metric, summary in results["Bootstrapped Metrics"].items():
    mean, ci = summary
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")

## Logistic Regression on Contextual ClinicalBERT with Bootstrapping

In [None]:
# Feature columns are those whose name starts with "Context_Embedding_";
# the label is the 'AD' column of the same contextual ClinicalBERT frames.
context_clinicalbert_embedding_cols = list(
    filter(
        lambda name: name.startswith("Context_Embedding_"),
        df_context_clinicalbert_train.columns,
    )
)

# Pull features and labels out as raw arrays for both splits.
X_train_embeddings = df_context_clinicalbert_train[context_clinicalbert_embedding_cols].values
y_train_embeddings = df_context_clinicalbert_train['AD'].values
X_test_embeddings = df_context_clinicalbert_test[context_clinicalbert_embedding_cols].values
y_test_embeddings = df_context_clinicalbert_test['AD'].values

# Re-wrap the arrays as fresh pandas objects (default integer index) in the
# form bootstrap_metrics is called with.
context_train_frame = pd.DataFrame(X_train_embeddings, columns=context_clinicalbert_embedding_cols)
context_test_frame = pd.DataFrame(X_test_embeddings, columns=context_clinicalbert_embedding_cols)
context_train_labels = pd.Series(y_train_embeddings)
context_test_labels = pd.Series(y_test_embeddings)

# Bootstrapped logistic regression over all contextual embedding dimensions:
# 1000 bootstrap iterations, classification threshold 0.5.
results = bootstrap_metrics(
    df_train=context_train_frame,
    df_test=context_test_frame,
    independent_vars=context_clinicalbert_embedding_cols,
    dep_var_train=context_train_labels,
    dep_var_test=context_test_labels,
    n_bootstrap=1000,
    threshold=0.5,
)

# Odds ratios from the one-time fit
print("\n===== Odds Ratios (one-time fit) — Contextual Embeddings =====")
print(results["Odds Ratios (one-time fit)"])

# Bootstrapped metrics, one "mean, 95% CI" line per metric
print("\n===== Bootstrapped Metrics (1000 runs) — Contextual Embeddings =====")
for metric, summary in results["Bootstrapped Metrics"].items():
    mean, ci = summary
    print(f"{metric}: Mean = {mean:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")