In [30]:
# --- 0. Imports ---

# Import all the libraries you'll need at the top.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE
# Use the imblearn pipeline to correctly apply SMOTE during CV
from imblearn.pipeline import Pipeline as ImbPipeline 

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Evaluation
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    f1_score,
    roc_curve,
    precision_recall_curve
)


# Settings
sns.set(style="whitegrid")
warnings.filterwarnings('ignore')

In [32]:
Dataset = pd.read_csv("D:\\Downloads\\Dataset\\Dementia Prediction Dataset.csv", low_memory=False)

print(list(Dataset))


['NACCID', 'NACCADC', 'PACKET', 'FORMVER', 'VISITMO', 'VISITDAY', 'VISITYR', 'NACCVNUM', 'NACCAVST', 'NACCNVST', 'NACCDAYS', 'NACCFDYS', 'NACCCORE', 'NACCREAS', 'NACCREFR', 'BIRTHMO', 'BIRTHYR', 'SEX', 'HISPANIC', 'HISPOR', 'HISPORX', 'RACE', 'RACEX', 'RACESEC', 'RACESECX', 'RACETER', 'RACETERX', 'PRIMLANG', 'PRIMLANX', 'EDUC', 'MARISTAT', 'NACCLIVS', 'INDEPEND', 'RESIDENC', 'HANDED', 'INBIRMO', 'INBIRYR', 'INSEX', 'NEWINF', 'INHISP', 'INHISPOR', 'INHISPOX', 'NACCNINR', 'INRACE', 'INRACEX', 'INRASEC', 'INRASECX', 'INRATER', 'INRATERX', 'INEDUC', 'INRELTO', 'INRELTOX', 'INKNOWN', 'INLIVWTH', 'INVISITS', 'INCALLS', 'INRELY', 'NACCFAM', 'NACCMOM', 'NACCDAD', 'NACCAM', 'NACCAMX', 'NACCAMS', 'NACCAMSX', 'NACCFM', 'NACCFMX', 'NACCFMS', 'NACCFMSX', 'NACCOM', 'NACCOMX', 'NACCOMS', 'NACCOMSX', 'NACCFADM', 'NACCFFTD', 'ANYMEDS', 'DRUG1', 'DRUG2', 'DRUG3', 'DRUG4', 'DRUG5', 'DRUG6', 'DRUG7', 'DRUG8', 'DRUG9', 'DRUG10', 'DRUG11', 'DRUG12', 'DRUG13', 'DRUG14', 'DRUG15', 'DRUG16', 'DRUG17', 'DRUG18'

In [51]:
# Assuming 'df_non_medical' is the DataFrame after removing specialist medical columns

# --- Target and Feature Separation ---
# Target Variable: DEMENTED 
Y = Dataset[['DEMENTED']]
X = Dataset.drop(columns=[['DEMENTED']], errors='ignore')

# --- Simple Feature Engineering (FAQ Score) ---
# Create the most powerful feature: the sum of the Functional Activities Questionnaire (FAQ).
faq_cols = ['BILLS', 'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 
            'MEALPREP', 'EVENTS', 'PAYATTN', 'REMDATES', 'TRAVEL']
existing_faq_cols = [col for col in faq_cols if col in X.columns]

# Impute raw FAQ columns with 0, then sum them up.
X['FAQ_SCORE'] = X[existing_faq_cols].fillna(0).sum(axis=1)

# Remove the raw FAQ variables to prevent multicollinearity and simplify the final feature set
X = X.drop(columns=existing_faq_cols, errors='ignore')

display(X)

Unnamed: 0,NACCID,NACCADC,PACKET,FORMVER,VISITMO,VISITDAY,VISITYR,NACCVNUM,NACCAVST,NACCNVST,...,NPATGAM2,NPATGAM3,NPATGAM4,NPATGAM5,NPATGFRN,NPATGFR1,NPATGFR2,NPATGFR3,NPATGFR4,FAQ_SCORE
0,NACC002909,186,I,3.0,12,28,2022,1,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,5
1,NACC002909,186,F,3.0,1,23,2024,2,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,6
2,NACC003487,186,I,3.0,11,15,2023,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,0
3,NACC004352,186,I,3.0,10,5,2021,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,30
4,NACC004687,186,I,3.0,11,14,2022,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195191,NACC998475,9661,F,2.0,11,6,2008,3,3,3,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,12
195192,NACC999391,9661,I,1.0,3,2,2006,1,3,3,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,8
195193,NACC999391,9661,F,1.0,5,17,2007,2,3,3,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,13
195194,NACC999391,9661,F,2.0,3,27,2008,3,3,3,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,11


In [60]:
# --- Feature Type Definition ---
# Define columns based on their nature after engineering
id_columns_to_drop = [
    'NACCID', 'VISITID', 'NACCESID', 'DATE', 'EXAMDATE', 'VISITMO', 'VISITYR',
    'NACCID', # Primary NACC ID
    'VISITID', # Visit Identifier
    'NACCESID', # Enrollee ID
    'PACKSPER_X' # Any remaining text fields that were not explicitly dropped
]

# Drop them from your features X
X = X.drop(columns=id_columns_to_drop, errors='ignore')

# --- Feature Type Definition ---
# Define columns based on their nature after engineering

# Numeric features (will be imputed by MEDIAN and scaled)
numeric_features = [
    'BIRTHYR', 'EDUC', 'NACCAGEB', 'INBIRYR', 'INEDUC', 'INKNOWN', 
    'SMOKYRS', 'PACKSPER', 'ALCOCCAS', 'ALCFREQ', 'FAQ_SCORE' # The new feature
]

# Categorical features (will be imputed by MODE and one-hot encoded)
# The remaining columns after removing the numeric ones and the raw FAQ columns.
all_cols_remaining = X.columns.tolist()
categorical_features = [col for col in all_cols_remaining if col not in numeric_features]



In [61]:
# --- Train-Test Split ---
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

In [62]:
# --- Preprocessing Pipelines ---
# Impute NaNs with median, then scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Impute NaNs with mode, then One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# --- Full Model Pipeline (Simple and Powerful) ---
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, max_depth=10, 
                                          class_weight='balanced', random_state=42)) 
])

# Train the model
print("Starting Model Training...")
model.fit(X_train, Y_train)
print("Model Training Complete.")

Starting Model Training...
Model Training Complete.


In [14]:
    !pip install shap



Collecting shap
  Downloading shap-0.50.0-cp313-cp313-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.50.0-cp313-cp313-win_amd64.whl (549 kB)
   ---------------------------------------- 0.0/549.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/549.1 kB ? eta -:--:--
   ------------------- -------------------- 262.1/549.1 kB ? eta -:--:--
   ------------------- -------------------- 262.1/549.1 kB ? eta -:--:--
   -------------------------------------- 549.1/549.1 kB 630.7 kB/s eta 0:00:00
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap

   -------------------- ------------------- 1/2 [shap]
   -------------------- ------------------- 1/2 [shap]
   -------------------- ------------------- 1/2 [shap]
   ---------------------------------------- 2/2 [shap]

Successfully installed shap-0.50.0 slicer-0.0.8


In [63]:
import shap # Required for explanation

# Get predictions on the unseen test set
Y_pred = model.predict(X_test)
Y_proba = model.predict_proba(X_test)[:, 1] # Probability for the DEMENTED class (1)

print("\n--- Model Performance Metrics ---")
print(f"1. ROC-AUC Score: {roc_auc_score(Y_test, Y_proba):.4f}")
print(f"2. F1 Score: {f1_score(Y_test, Y_pred):.4f}")

# Classification Report: Provides Precision, Recall, and F1-Score for each class
print("\n3. Classification Report:")
print(classification_report(Y_test, Y_pred, target_names=['Not at risk', 'At risk']))



--- Model Performance Metrics ---
1. ROC-AUC Score: 1.0000
2. F1 Score: 0.9862

3. Classification Report:
              precision    recall  f1-score   support

 Not at risk       1.00      0.99      0.99     27565
     At risk       0.97      1.00      0.99     11475

    accuracy                           0.99     39040
   macro avg       0.99      0.99      0.99     39040
weighted avg       0.99      0.99      0.99     39040

