In [4]:
# 2_model_exploration [Target-Group].ipynb (Multi-Model Version)

# Import necessary libraries
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Import all the models you want to test
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

# Load the processed data from Notebook 1.
# Fail fast when the file is missing: the steps below read the 'CDR' and
# 'Subject ID' columns, so carrying on with an empty DataFrame would only
# surface later as an unrelated-looking KeyError.
try:
    df = pd.read_csv('processed_data.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: 'processed_data.csv' not found. Please run Notebook 1 first."
    ) from err

# --- Prepare Data for Modeling ---
# Collapse 'CDR' (Clinical Dementia Rating) into the binary target 'Group':
# 1 when CDR > 0, otherwise 0. The raw rating is removed from the frame so it
# cannot end up among the features.
# NOTE(review): a missing CDR compares as False and would be labelled 0 --
# presumably Notebook 1 already removed such rows; confirm.
cdr_scores = df.pop('CDR')
df['Group'] = cdr_scores.gt(0).astype(int)

# Target (y) and features (X): every remaining column except the label and
# the subject identifier.
# NOTE(review): the split below works on rows; if one 'Subject ID' can occur
# on several rows, the same subject may land in both train and test -- confirm.
y = df['Group']
X = df.drop(columns=['Group', 'Subject ID'])

# Hold out 25% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# --- Define Models and Hyperparameter Grids ---
# Every candidate is a two-step pipeline: a StandardScaler followed by the
# classifier. Grid keys are prefixed with the classifier's step name
# ('<step>__<parameter>').


def _scaled(step_name, estimator):
    """Return a Pipeline that runs StandardScaler before `estimator`."""
    return Pipeline([('scaler', StandardScaler()), (step_name, estimator)])


# 1. Support Vector Machine (SVM)
pipe_svc = _scaled('svc', SVC(probability=True, random_state=42))
param_svc = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 'auto'],
    'svc__kernel': ['rbf', 'linear'],
}

# 2. Random Forest
pipe_rf = _scaled('rf', RandomForestClassifier(random_state=42))
param_rf = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20, None],
}

# 3. Gradient Boosting
pipe_gb = _scaled('gb', GradientBoostingClassifier(random_state=42))
param_gb = {
    'gb__n_estimators': [100, 200],
    'gb__learning_rate': [0.05, 0.1],
    'gb__max_depth': [3, 5],
}

# 4. AdaBoost
pipe_ada = _scaled('ada', AdaBoostClassifier(random_state=42))
param_ada = {
    'ada__n_estimators': [50, 100, 200],
    'ada__learning_rate': [0.05, 0.1, 1.0],
}

# 5. XGBoost
pipe_xgb = _scaled(
    'xgb',
    XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
)
param_xgb = {
    'xgb__n_estimators': [100, 200],
    'xgb__learning_rate': [0.05, 0.1],
    'xgb__max_depth': [3, 5],
}

# 6. Bagging Classifier
pipe_bag = _scaled('bag', BaggingClassifier(random_state=42))
param_bag = {'bag__n_estimators': [50, 100, 200]}


# --- Run GridSearchCV for Each Model ---
# Three parallel lists: the n-th pipeline is tuned over the n-th grid and
# reported under the n-th name.
pipelines = [pipe_svc, pipe_rf, pipe_gb, pipe_ada, pipe_xgb, pipe_bag]
params = [param_svc, param_rf, param_gb, param_ada, param_xgb, param_bag]
model_names = ['SVM', 'Random Forest', 'Gradient Boosting', 'AdaBoost', 'XGBoost', 'Bagging']
all_results = []

# A single seeded, shuffled 5-fold stratified splitter shared by all models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, pipe, grid in zip(model_names, pipelines, params):
    print(f"\n----- Tuning {name} -----")
    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring='accuracy',
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Predict on the held-out test set with the tuned search object.
    y_pred = grid_search.predict(X_test)

    # One summary row per model, collected for the final report.
    result = {
        'Model': name,
        'Best CV Accuracy': grid_search.best_score_,
        'Test Set Accuracy': accuracy_score(y_test, y_pred),
        'Test Set F1 Score': f1_score(y_test, y_pred),
        'Best Parameters': grid_search.best_params_,
    }

    print(f"Best CV Accuracy: {result['Best CV Accuracy']:.4f}")
    print(f"Test Set Accuracy: {result['Test Set Accuracy']:.4f}")
    print(f"Best Parameters: {result['Best Parameters']}")

    all_results.append(result)

# --- Save and Display Results ---
# One row per tuned model, highest test-set accuracy first.
# NOTE(review): ranking the models (and choosing the one saved below) by
# test-set accuracy makes the winner's reported test score optimistic;
# consider ranking by 'Best CV Accuracy' and reporting the test score only
# for the model chosen that way.
results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values(by='Test Set Accuracy', ascending=False)

print("\n\n" + "="*50)
print("          MODEL EXPLORATION SUMMARY")
print("="*50)
print(results_df.to_string())

# pandas does not create missing parent directories when writing, so make
# sure 'results/' exists before saving anything into it.
os.makedirs('results', exist_ok=True)

# Save the detailed results for your research paper
results_df.to_csv('results/all_models_summary.csv', index=False)
print("\n\nFull summary saved to 'results/all_models_summary.csv'")

# Save the best model's configuration for Notebook 3
# (the first row after sorting, i.e. the highest test-set accuracy).
best_model_config = results_df.iloc[0]
best_model_config.to_json('results/best_model_config.json')
print(f"Configuration for the best model ({best_model_config['Model']}) saved for the final step.")

Training set shape: (606, 12)
Testing set shape: (203, 12)

----- Tuning SVM -----
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best CV Accuracy: 0.8680
Test Set Accuracy: 0.8768
Best Parameters: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}

----- Tuning Random Forest -----
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best CV Accuracy: 0.9010
Test Set Accuracy: 0.8916
Best Parameters: {'rf__max_depth': 10, 'rf__n_estimators': 100}

----- Tuning Gradient Boosting -----
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best CV Accuracy: 0.9010
Test Set Accuracy: 0.9015
Best Parameters: {'gb__learning_rate': 0.1, 'gb__max_depth': 5, 'gb__n_estimators': 100}

----- Tuning AdaBoost -----
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best CV Accuracy: 0.8713
Test Set Accuracy: 0.8768
Best Parameters: {'ada__learning_rate': 1.0, 'ada__n_estimators': 200}

----- Tuning XGBoost -----
Fitting 5 folds for each of 8 candidate