In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
import joblib
from imblearn.over_sampling import SMOTE

In [16]:
X_train_smote = joblib.load("D:\Study\Model X\Dementia Prediction\Data_Preprocessing\Artifacts\X_train_smote.pkl")
y_train_smote = joblib.load("D:\Study\Model X\Dementia Prediction\Data_Preprocessing\Artifacts\y_train_smote.pkl")
X_test = joblib.load("D:\Study\Model X\Dementia Prediction\Data_Preprocessing\Artifacts\X_test.pkl")
y_test = joblib.load("D:\Study\Model X\Dementia Prediction\Data_Preprocessing\Artifacts\y_test.pkl")

In [17]:
print("Training set shape:", X_train_smote.shape, y_train_smote.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (220168, 138) (220168,)
Test set shape: (39040, 138) (39040,)


In [18]:
rf = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
cat = CatBoostClassifier(verbose=0, random_state=42)

In [19]:
# Use StratifiedKFold to preserve class ratios
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [20]:
# Scoring function
scoring = {'F1': make_scorer(f1_score), 'ROC-AUC': 'roc_auc'}

In [21]:
# RandomForest parameter grid
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# XGBoost parameter grid
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}

# CatBoost parameter grid
cat_param_grid = {
    'iterations': [200, 500],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5]
}

In [22]:
rf_grid = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grid,
    n_iter=10,
    scoring=make_scorer(f1_score), 
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

rf_grid.fit(X_train_smote, y_train_smote)
print("Best RF params:", rf_grid.best_params_)
print("Best RF F1 score:", rf_grid.best_score_)
rf_best = rf_grid.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [23]:
# Randomly sample 50k rows for faster tuning
X_train_sub = X_train_smote.sample(n=50000, random_state=42)
y_train_sub = y_train_smote.loc[X_train_sub.index]

In [24]:
cv_sub = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [25]:
rf = RandomForestClassifier(random_state=42)

rf_grid = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grid,
    n_iter=5,                # fewer combinations
    scoring=make_scorer(f1_score),
    cv=cv_sub,               # 3-fold CV
    verbose=2,
    n_jobs=-1,
    random_state=42
)

rf_grid.fit(X_train_sub, y_train_sub)

print("Best RF params:", rf_grid.best_params_)
print("Best RF F1 score:", rf_grid.best_score_)
rf_best = rf_grid.best_estimator_

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best RF params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
Best RF F1 score: 0.9049676203175343


In [None]:
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}

xgb_grid = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_param_grid,
    n_iter=5,
    scoring=f1_score,
    cv=cv_sub,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

xgb_grid.fit(X_train_sub, y_train_sub)
xgb_best = xgb_grid.best_estimator_
print("Best XGB params:", xgb_grid.best_params_)
print("Best XGB F1 score (subset CV):", xgb_grid.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best XGB params: {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Best XGB F1 score (subset CV): 0.9188378328992343


In [None]:
# CatBoost
cat = CatBoostClassifier(verbose=0, random_state=42)

cat_param_grid = {
    'iterations': [200, 500],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5]
}

cat_grid = RandomizedSearchCV(
    estimator=cat,
    param_distributions=cat_param_grid,
    n_iter=5,
    scoring=f1_score,
    cv=cv_sub,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

cat_grid.fit(X_train_sub, y_train_sub)
cat_best = cat_grid.best_estimator_
print("Best CatBoost params:", cat_grid.best_params_)
print("Best CatBoost F1 score (subset CV):", cat_grid.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best CatBoost params: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500, 'depth': 8}
Best CatBoost F1 score (subset CV): 0.9233223821520706


In [30]:
rf_best.fit(X_train_smote, y_train_smote)
xgb_best.fit(X_train_smote, y_train_smote)
cat_best.fit(X_train_smote, y_train_smote)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


<catboost.core.CatBoostClassifier at 0x1772ca14c50>

In [31]:
voting_clf_best = VotingClassifier(
    estimators=[('rf', rf_best), ('xgb', xgb_best), ('cat', cat_best)],
    voting='soft'
)

voting_clf_best.fit(X_train_smote, y_train_smote)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,estimators,"[('rf', ...), ('xgb', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


In [36]:
joblib.dump(voting_clf_best, "D:\Study\Model X\Dementia Prediction\Models/voting_clf_best.pkl")

['D:\\Study\\Model X\\Dementia Prediction\\Models/voting_clf_best.pkl']