### 1. Import Dependencies

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
import joblib
from sklearn.ensemble import VotingClassifier

### 2. Basic Processing

In [2]:
df = pd.read_csv(r"D:\Study\Model X\Dementia Prediction\Dataset\Processed\Feature_Engineered.csv")
df.head()

Unnamed: 0,BIRTHMO,BIRTHYR,SEX,EDUC,NACCAGE,NACCAGEB,INRELTO,TOBAC30,TOBAC100,SMOKYRS,...,INCONTF_2.0,ALCOHOL_0.0,ALCOHOL_1.0,ALCOHOL_2.0,ABUSOTHR_0.0.1,ABUSOTHR_1.0.1,ABUSOTHR_2.0.1,PSYCDIS_0.0.1,PSYCDIS_1.0.1,PSYCDIS_2.0.1
0,-0.142857,0.8125,1,0.0,-0.384615,-0.076923,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
1,-0.142857,0.8125,1,0.0,-0.307692,-0.076923,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
2,0.857143,1.0625,1,0.0,-0.692308,-0.384615,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
3,-0.714286,1.1875,0,0.0,-0.923077,-0.615385,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
4,-0.571429,0.375,1,-1.0,0.153846,0.461538,3.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False


### 3. Modelling

In [3]:
# Features and target
X = df.drop(columns=['DEMENTED'])
y = df['DEMENTED']

In [4]:
# Define models
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

In [5]:
## Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

In [6]:
# Train and evaluate
results = {}

for name, model in models.items():
    roc_auc_scores = []
    f1_scores = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        # ROC-AUC using probabilities
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_val)[:, 1]
        else:
            # For models like some SVMs without predict_proba
            y_prob = model.decision_function(X_val)
        
        roc_auc_scores.append(roc_auc_score(y_val, y_prob))
        f1_scores.append(f1_score(y_val, y_pred))

    results[name] = {
        'ROC-AUC': np.mean(roc_auc_scores),
        'F1-score': np.mean(f1_scores)
    }



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [7]:
results_df = pd.DataFrame(results).T
results_df.sort_values(by='ROC-AUC', ascending=False)

Unnamed: 0,ROC-AUC,F1-score
RandomForest,0.954114,0.853103
CatBoost,0.942323,0.837224
XGBoost,0.940766,0.834793
LogisticRegression,0.927924,0.826859
DecisionTree,0.845585,0.781939


In [12]:
# Define the top models with reasonable parameters
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', n_jobs=-1, random_state=42)
cat = CatBoostClassifier(iterations=200, verbose=0, thread_count=-1, random_state=42)

In [13]:
# Build the soft-voting ensemble
voting_clf = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('cat', cat)],
    voting='soft'  # use predicted probabilities
)

In [14]:
# Stratified K-Fold for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
roc_auc_scores = []
f1_scores = []

for train_idx, val_idx in cv.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    voting_clf.fit(X_train, y_train)
    
    y_pred = voting_clf.predict(X_val)
    y_prob = voting_clf.predict_proba(X_val)[:,1]
    
    roc_auc_scores.append(roc_auc_score(y_val, y_prob))
    f1_scores.append(f1_score(y_val, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [17]:
print("Voting Ensemble ROC-AUC:", np.mean(roc_auc_scores))
print("Voting Ensemble F1-score:", np.mean(f1_scores))

Voting Ensemble ROC-AUC: 0.9533663894037245
Voting Ensemble F1-score: 0.842289636314136


In [18]:
rf.fit(X, y)
xgb.fit(X, y)
cat.fit(X, y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


<catboost.core.CatBoostClassifier at 0x209d24bc990>

In [None]:
joblib.dump(rf, f"D:\Study\Model X\Dementia Prediction\Models\\rf_model.pkl")
joblib.dump(xgb, f"D:\Study\Model X\Dementia Prediction\Models\\xgb_model.pkl")
joblib.dump(cat, f"D:\Study\Model X\Dementia Prediction\Models\\cat_model.pkl")

In [23]:
voting_clf.fit(X, y)  # fit the ensemble on the full dataset
joblib.dump(voting_clf, f"D:\Study\Model X\Dementia Prediction\Models\\VotingEnsemble_model.pkl")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


['D:\\Study\\Model X\\Dementia Prediction\\Models\\VotingEnsemble_model.pkl']