In [21]:
# Import additional dependencies
# Import dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
os.environ["KERAS_BACKEND"] = "torch"
import keras

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

from keras import layers, Input
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier

from imblearn.pipeline import Pipeline


import xgboost as xgb


from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import warnings
# warnings.filterwarnings('ignore')

# Load and prepare the data (assuming this part is already done)
# X_train, X_test, y_train, y_test should be available


In [22]:
# load balanced data
data = pd.read_csv('training_balanced_data.csv')
X = data.drop(columns=['Outcome', 'Id'])
y = data['Outcome']

#split in training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Step 1: Define evaluation function
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    
    f1 = f1_score(y_test, y_pred, average='macro')
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
    
    return f1, roc_auc

# Step 2: Enhanced preprocessing pipeline
def create_enhanced_preprocessing():
    preprocessing = Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler())
    ])
    return preprocessing

# Step 3: Try different classifiers with optimized pipelines
def create_xgb_pipeline():
    preprocessing = create_enhanced_preprocessing()
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss'))
    ])
    return pipeline

def create_svm_pipeline():
    preprocessing = create_enhanced_preprocessing()
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('pca', PCA(n_components=100)),
        ('svm', SVC(probability=True, random_state=42))
    ])
    return pipeline

def create_voting_pipeline():
    preprocessing = create_enhanced_preprocessing()
    
    # Individual classifiers
    rf = RandomForestClassifier(n_estimators=250, random_state=42)
    xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')
    lr = LogisticRegression(random_state=42, max_iter=1000)
    
    # Voting classifier
    voting = VotingClassifier(
        estimators=[
            ('rf', rf),
            ('xgb', xgb),
            ('lr', lr)
        ],
        voting='soft'
    )
    
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('voting', voting)
    ])
    return pipeline

def create_balanced_rf_pipeline():
    preprocessing = create_enhanced_preprocessing()
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('balanced_rf', BalancedRandomForestClassifier(random_state=42))
    ])
    return pipeline

# Step 4: Hyperparameter tuning for the best performing model
def optimize_xgb_hyperparameters(X_train, y_train):
    preprocessing = create_enhanced_preprocessing()
    
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss'))
    ])
    
    param_grid = {
        'xgb__n_estimators': [100, 200, 300],
        'xgb__max_depth': [3, 5, 7],
        'xgb__learning_rate': [0.01, 0.1, 0.2],
        'xgb__subsample': [0.8, 0.9, 1.0]
    }
    
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=10, scoring='f1_macro', 
        n_jobs=-2, verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    
    return grid_search.best_estimator_, grid_search.best_score_

# Step 5: Try SMOTE for handling class imbalance
def create_smote_pipeline():
    preprocessing = create_enhanced_preprocessing()
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('smote', SMOTE(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss'))
    ])
    return pipeline

# Step 6: Feature selection approach
def create_feature_selection_pipeline():
    preprocessing = create_enhanced_preprocessing()
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('feature_selection', SelectKBest(f_classif, k=500)),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss'))
    ])
    return pipeline

# Step 7: Evaluate all models using cross-validation
def evaluate_all_models(X_train, y_train):
    models = {
        'XGBoost': create_xgb_pipeline(),
        'SVM': create_svm_pipeline(),
        'Voting_Classifier': create_voting_pipeline(),
        'Balanced_RF': create_balanced_rf_pipeline(),
        'SMOTE_XGB': create_smote_pipeline(),
        'Feature_Selection_XGB': create_feature_selection_pipeline()
    }
    
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    results = {}
    
    for name, model in models.items():
        print(f"Evaluating {name}...")
        
        # Cross-validation scores
        cv_scores_f1 = cross_val_score(model, X_train, y_train, 
                                     cv=cv, scoring='f1_macro', n_jobs=-1)
        cv_scores_roc = cross_val_score(model, X_train, y_train,
                                      cv=cv, scoring='roc_auc_ovr', n_jobs=-1)
        
        results[name] = {
            'f1_mean': cv_scores_f1.mean(),
            'f1_std': cv_scores_f1.std(),
            'roc_auc_mean': cv_scores_roc.mean(),
            'roc_auc_std': cv_scores_roc.std()
        }
        
        print(f"{name} - F1: {cv_scores_f1.mean():.4f} (±{cv_scores_f1.std():.4f}), "
              f"ROC AUC: {cv_scores_roc.mean():.4f} (±{cv_scores_roc.std():.4f})")
    
    return results, models

# Run evaluation
print("Evaluating all models...")
results, models = evaluate_all_models(X_train, y_train)

# Step 8: Hyperparameter optimization for the best model
print("\nOptimizing hyperparameters for XGBoost...")
best_xgb_model, best_score = optimize_xgb_hyperparameters(X_train, y_train)
print(f"Best cross-validation score: {best_score:.4f}")

# Add optimized model to our models dictionary
models['Optimized_XGB'] = best_xgb_model

# Step 9: Train final models on full training data and evaluate on test set
print("\nTraining final models on full training data...")
final_results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    f1, roc_auc = evaluate_model(model, X_test, y_test)
    final_results[name] = {'f1': f1, 'roc_auc': roc_auc}
    print(f"{name} - Test F1: {f1:.4f}, Test ROC AUC: {roc_auc:.4f}")

# Step 10: Compare with previous models from steps 3, 5, 7, 9
# (Assuming we have these models saved as: model_step3, model_step5, model_step7, model_step9)

previous_models = {
    'Step3_Model': make_pipeline(SimpleImputer(), StandardScaler(), LogisticRegression(max_iter=1000, penalty = None)),  # Replace with actual model from step 3
    'Step5_Model': make_pipeline(StandardScaler(), KNNImputer(n_neighbors=3), LogisticRegression(C = 1)),  # Replace with actual model from step 5  
    'Step7_Model': make_pipeline(StandardScaler(), KNNImputer(n_neighbors=3), PCA(n_components=100), LogisticRegression(C = 1)),  # Replace with actual model from step 7
    'Step9_Model': make_pipeline(StandardScaler(), KNNImputer(n_neighbors=3), PCA(), LogisticRegression())   # Replace with actual model from step 9
}

print("\nEvaluating previous step models on test set...")
for name, model in previous_models.items():
    f1, roc_auc = evaluate_model(model, X_test, y_test)
    final_results[name] = {'f1': f1, 'roc_auc': roc_auc}
    print(f"{name} - Test F1: {f1:.4f}, Test ROC AUC: {roc_auc:.4f}")

# Step 11: Display final comparison
print("\n" + "="*50)
print("FINAL COMPARISON ON TEST SET")
print("="*50)

# Sort by F1 score
sorted_results = sorted(final_results.items(), key=lambda x: x[1]['f1'], reverse=True)

for name, scores in sorted_results:
    print(f"{name:20} - F1: {scores['f1']:.4f}, ROC AUC: {scores['roc_auc']:.4f}")

# Identify best model
best_model_name, best_scores = sorted_results[0]
print(f"\nBEST MODEL: {best_model_name}")
print(f"F1 Score: {best_scores['f1']:.4f}")
print(f"ROC AUC OvR: {best_scores['roc_auc']:.4f}")

Evaluating all models...
Evaluating XGBoost...


ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/bmlproj/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/bmlproj/lib/python3.11/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/bmlproj/lib/python3.11/site-packages/imblearn/pipeline.py", line 518, in fit
    Xt, yt = self._fit(X, y, routed_params, raw_params=params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/bmlproj/lib/python3.11/site-packages/imblearn/pipeline.py", line 400, in _fit
    self._validate_steps()
  File "/opt/anaconda3/envs/bmlproj/lib/python3.11/site-packages/imblearn/pipeline.py", line 289, in _validate_steps
    raise TypeError(
TypeError: All intermediate steps of the chain should not be Pipelines
