In [1]:
# Import dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
os.environ["KERAS_BACKEND"] = "torch"
import keras

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

from keras import layers, Input
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier

from imblearn.pipeline import Pipeline


import xgboost as xgb


from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import warnings
# warnings.filterwarnings('ignore')

# Data loading and the train/test split happen in the next cell,
# which defines X_train, X_test, y_train and y_test.


In [2]:
# Read the training data from disk (the file name says it is already class-balanced).
data = pd.read_csv('training_balanced_data.csv')

# 'Outcome' is the target; it and the 'Id' column are excluded from the features.
y = data['Outcome']
X = data.drop(columns=['Outcome', 'Id'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Step 1: Define evaluation function
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and ``predict_proba``.
    X_test : array-like
        Features to score on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(f1, roc_auc)``: macro-averaged F1 of the hard predictions and
        macro-averaged one-vs-rest ROC AUC of the predicted probabilities.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = np.asarray(model.predict_proba(X_test))

    # A two-class model yields two probability columns, but roc_auc_score
    # expects a 1-D score (the class with the greater label) for a binary
    # target. Keep only that column so the function works for both the
    # binary and the multiclass case.
    if y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 2:
        y_pred_proba = y_pred_proba[:, 1]

    f1 = f1_score(y_test, y_pred, average='macro')
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

    return f1, roc_auc

# Step 2: Enhanced preprocessing pipeline - FIXED: Use regular sklearn pipeline for preprocessing
def create_enhanced_preprocessing():
    """Return a preprocessing-only pipeline: 3-neighbour KNN imputation, then standard scaling."""
    return make_pipeline(KNNImputer(n_neighbors=3), StandardScaler())

# Step 3: Try different classifiers with optimized pipelines - FIXED: Use imblearn Pipeline directly
def create_xgb_pipeline():
    """Return an impute -> scale -> XGBoost pipeline with default XGBoost settings."""
    return Pipeline(steps=[
        # Fill missing values from the 3 nearest neighbours
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ])

def create_svm_pipeline(n_components=100):
    """Return an impute -> scale -> PCA -> SVM pipeline.

    Parameters
    ----------
    n_components : int, default=100
        Number of principal components kept before the SVM. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    Pipeline
        Unfitted pipeline whose final step is an ``SVC`` with probability
        estimates enabled.
    """
    pipeline = Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        # PCA can select a randomized SVD solver; without a seed the projected
        # features (and therefore the SVM scores) change from run to run.
        ('pca', PCA(n_components=n_components, random_state=42)),
        ('svm', SVC(probability=True, random_state=42))
    ])
    return pipeline

def create_voting_pipeline():
    """Return an impute -> scale -> soft-voting ensemble pipeline.

    The ensemble combines a random forest, an XGBoost classifier and a
    logistic regression with ``voting='soft'``.
    """
    # Ensemble members, in the order they are registered with the voter
    members = [
        ('rf', RandomForestClassifier(n_estimators=250, random_state=42)),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
        ('lr', LogisticRegression(random_state=42, max_iter=1000)),
    ]
    ensemble = VotingClassifier(estimators=members, voting='soft')

    return Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('voting', ensemble),
    ])

def create_balanced_rf_pipeline():
    """Return an impute -> scale -> balanced random forest pipeline."""
    steps = [
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('balanced_rf', BalancedRandomForestClassifier(random_state=42)),
    ]
    return Pipeline(steps)

# Step 4: Hyperparameter tuning for the best performing model
def optimize_xgb_hyperparameters(X_train, y_train):
    """Grid-search XGBoost settings inside the impute -> scale -> XGBoost pipeline.

    Every combination in the grid (3 x 3 x 3 x 3 = 81 candidates) is scored
    with 5-fold cross-validation on macro F1.

    Parameters
    ----------
    X_train, y_train
        Training features and labels the search is fitted on.

    Returns
    -------
    tuple
        ``(best_estimator_, best_score_)`` as reported by ``GridSearchCV``.
    """
    steps = [
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ]

    # Keys are prefixed with the pipeline step name so they reach the classifier
    search_space = dict(
        xgb__n_estimators=[100, 200, 300],
        xgb__max_depth=[3, 5, 7],
        xgb__learning_rate=[0.01, 0.1, 0.2],
        xgb__subsample=[0.8, 0.9, 1.0],
    )

    search = GridSearchCV(
        Pipeline(steps),
        search_space,
        cv=5,  # 5 folds rather than 10 to keep the search fast
        scoring='f1_macro',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_, search.best_score_

# Step 5: Try SMOTE for handling class imbalance
def create_smote_pipeline():
    """Return an impute -> scale -> SMOTE -> XGBoost pipeline.

    ``Pipeline`` here is the imbalanced-learn class, which is what allows a
    sampler such as SMOTE to appear as a step.
    """
    steps = [
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ]
    return Pipeline(steps)

# Step 6: Feature selection approach
def create_feature_selection_pipeline(k=500, n_features=None):
    """Return an impute -> scale -> SelectKBest -> XGBoost pipeline.

    Parameters
    ----------
    k : int, default=500
        Maximum number of features kept by the ANOVA F-test selector.
    n_features : int or None, default=None
        Number of columns in the data the pipeline will be fitted on, used to
        cap ``k``. When ``None`` the count is read from the notebook-level
        ``X_train``, which is what the function always did before; pass it
        explicitly to avoid depending on that global.

    Returns
    -------
    Pipeline
        Unfitted pipeline selecting ``min(k, n_features)`` features.
    """
    if n_features is None:
        # Backward-compatible fallback to the notebook-global training frame
        n_features = X_train.shape[1]

    pipeline = Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        # Never ask the selector for more features than the data has
        ('feature_selection', SelectKBest(f_classif, k=min(k, n_features))),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss'))
    ])
    return pipeline

# Step 7: Evaluate all models using cross-validation
def evaluate_all_models(X_train, y_train):
    """Cross-validate every candidate pipeline on the training data.

    Each pipeline is scored with shuffled, stratified 5-fold cross-validation
    on macro F1 and one-vs-rest ROC AUC. Both metrics are computed from a
    single ``cross_validate`` run, so every fold is fitted once instead of
    once per metric.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.

    Returns
    -------
    tuple
        ``(results, models)``. ``results`` maps each model name to a dict with
        ``f1_mean``, ``f1_std``, ``roc_auc_mean`` and ``roc_auc_std``;
        ``models`` maps the same names to the pipeline objects that were
        evaluated. A model whose evaluation raises is reported on stdout and
        recorded with all four values set to 0.
    """
    models = {
        'XGBoost': create_xgb_pipeline(),
        'SVM': create_svm_pipeline(),
        'Voting_Classifier': create_voting_pipeline(),
        'Balanced_RF': create_balanced_rf_pipeline(),
        'SMOTE_XGB': create_smote_pipeline(),
        'Feature_Selection_XGB': create_feature_selection_pipeline()
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Reduced from 10 to 5 for speed
    # Score both metrics on the same fitted folds
    scoring = {'f1': 'f1_macro', 'roc_auc': 'roc_auc_ovr'}
    results = {}

    for name, model in models.items():
        print(f"Evaluating {name}...")

        try:
            cv_output = cross_validate(model, X_train, y_train,
                                       cv=cv, scoring=scoring, n_jobs=-1)
            cv_scores_f1 = cv_output['test_f1']
            cv_scores_roc = cv_output['test_roc_auc']

            results[name] = {
                'f1_mean': cv_scores_f1.mean(),
                'f1_std': cv_scores_f1.std(),
                'roc_auc_mean': cv_scores_roc.mean(),
                'roc_auc_std': cv_scores_roc.std()
            }

            print(f"{name} - F1: {cv_scores_f1.mean():.4f} (±{cv_scores_f1.std():.4f}), "
                  f"ROC AUC: {cv_scores_roc.mean():.4f} (±{cv_scores_roc.std():.4f})")
        except Exception as e:
            # Best effort: keep evaluating the remaining models
            print(f"Error evaluating {name}: {e}")
            results[name] = {
                'f1_mean': 0,
                'f1_std': 0,
                'roc_auc_mean': 0,
                'roc_auc_std': 0
            }

    return results, models

# Run evaluation: cross-validate every candidate pipeline on the training split.
# `models` maps model names to the pipeline objects that were evaluated.
print("Evaluating all models...")
results, models = evaluate_all_models(X_train, y_train)

# Step 8: Hyperparameter optimization for XGBoost
# (XGBoost is tuned regardless of which model scored best above.)
print("\nOptimizing hyperparameters for XGBoost...")
try:
    best_xgb_model, best_score = optimize_xgb_hyperparameters(X_train, y_train)
    print(f"Best cross-validation score: {best_score:.4f}")
    # Add optimized model to our models dictionary
    models['Optimized_XGB'] = best_xgb_model
except Exception as e:
    print(f"Hyperparameter optimization failed: {e}")
    # Use default XGBoost as fallback.
    # NOTE(review): the entry keeps the name 'Optimized_XGB' even though it is
    # then an untuned pipeline; only the message above reveals that.
    models['Optimized_XGB'] = create_xgb_pipeline()

# Step 9: Train final models on full training data and evaluate on test set
# Every pipeline in `models` is fitted on the whole training split and scored
# once on the held-out test split; scores are collected in `final_results`.
print("\nTraining final models on full training data...")
final_results = {}

for name, model in models.items():
    print(f"Training {name}...")
    try:
        model.fit(X_train, y_train)
        f1, roc_auc = evaluate_model(model, X_test, y_test)
        final_results[name] = {'f1': f1, 'roc_auc': roc_auc}
        print(f"{name} - Test F1: {f1:.4f}, Test ROC AUC: {roc_auc:.4f}")
    except Exception as e:
        # Best effort: a failing model is reported and recorded with zero
        # scores, so it sorts last in the final comparison instead of
        # aborting the loop.
        print(f"Error training {name}: {e}")
        final_results[name] = {'f1': 0, 'roc_auc': 0}

# Step 10: Baselines to compare against the models from steps 3, 5, 7, 9
print("\nEvaluating baseline models on test set...")

# Each baseline shares the same simple preprocessing (mean imputation followed
# by standard scaling) and differs only in the final classifier.
baseline_models = {
    label: make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), classifier)
    for label, classifier in (
        ('Baseline_LR', LogisticRegression(max_iter=1000, random_state=42)),
        ('Baseline_RF', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('Baseline_XGB', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    )
}

# Fit every baseline on the training split and add its test scores to final_results
for name, model in baseline_models.items():
    try:
        model.fit(X_train, y_train)
        f1, roc_auc = evaluate_model(model, X_test, y_test)
        final_results[name] = {'f1': f1, 'roc_auc': roc_auc}
        print(f"{name} - Test F1: {f1:.4f}, Test ROC AUC: {roc_auc:.4f}")
    except Exception as exc:
        # A failing baseline is reported and recorded with zero scores
        print(f"Error with {name}: {exc}")
        final_results[name] = {'f1': 0, 'roc_auc': 0}

# Step 11: Print the test-set scoreboard
print("\n" + "="*50)
print("FINAL COMPARISON ON TEST SET")
print("="*50)

# Rank every evaluated model from highest to lowest test F1
sorted_results = sorted(
    final_results.items(),
    key=lambda item: item[1]['f1'],
    reverse=True,
)

for name, scores in sorted_results:
    print(f"{name:25} - F1: {scores['f1']:.4f}, ROC AUC: {scores['roc_auc']:.4f}")

if not sorted_results:
    print("No models were successfully trained.")
else:
    # The first entry of the ranking is the winner
    best_model_name, best_scores = sorted_results[0]
    print(f"\nBEST MODEL: {best_model_name}")
    print(f"F1 Score: {best_scores['f1']:.4f}")
    print(f"ROC AUC OvR: {best_scores['roc_auc']:.4f}")

    # Look the winning estimator up among the main models first, then the baselines
    best_model = (
        models[best_model_name] if best_model_name in models
        else baseline_models[best_model_name] if best_model_name in baseline_models
        else None
    )

    if best_model is not None:
        print(f"Best model '{best_model_name}' is ready for use!")

Evaluating all models...
Evaluating XGBoost...
XGBoost - F1: 0.7857 (±0.0126), ROC AUC: 0.9481 (±0.0088)
Evaluating SVM...
SVM - F1: 0.8571 (±0.0195), ROC AUC: 0.9736 (±0.0095)
Evaluating Voting_Classifier...
Voting_Classifier - F1: 0.8217 (±0.0214), ROC AUC: 0.9596 (±0.0085)
Evaluating Balanced_RF...
Balanced_RF - F1: 0.6807 (±0.0180), ROC AUC: 0.8704 (±0.0081)
Evaluating SMOTE_XGB...
SMOTE_XGB - F1: 0.7856 (±0.0266), ROC AUC: 0.9471 (±0.0101)
Evaluating Feature_Selection_XGB...
Feature_Selection_XGB - F1: 0.8070 (±0.0246), ROC AUC: 0.9552 (±0.0087)

Optimizing hyperparameters for XGBoost...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best cross-validation score: 0.8314

Training final models on full training data...
Training XGBoost...
XGBoost - Test F1: 0.8275, Test ROC AUC: 0.9658
Training SVM...
SVM - Test F1: 0.8759, Test ROC AUC: 0.9773
Training Voting_Classifier...
Voting_Classifier - Test F1: 0.8753, Test ROC AUC: 0.9721
Training Balanced_RF...
Balanced_RF - 

In [None]:

# Load data: read the training file, then separate the target from the features

data = pd.read_csv('training_balanced_data.csv')
y = data['Outcome']
X = data.drop(columns=['Outcome', 'Id'])  # 'Id' is excluded from the features

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


# Evaluation function

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and ``predict_proba``.
    X_test : array-like
        Features to score on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(f1, roc_auc)``: macro-averaged F1 of the hard predictions and
        macro-averaged one-vs-rest ROC AUC of the predicted probabilities.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = np.asarray(model.predict_proba(X_test))

    # A two-class model yields two probability columns, but roc_auc_score
    # expects a 1-D score (the class with the greater label) for a binary
    # target. Keep only that column so the function works for both the
    # binary and the multiclass case.
    if y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 2:
        y_pred_proba = y_pred_proba[:, 1]

    f1 = f1_score(y_test, y_pred, average='macro')
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

    return f1, roc_auc


# Pipelines

def create_xgb_pipeline():
    """Build the impute -> scale -> XGBoost pipeline with default XGBoost settings."""
    steps = [
        ('imputer', KNNImputer(n_neighbors=3)),  # fill gaps from the 3 nearest rows
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ]
    return Pipeline(steps)

def create_svm_pipeline(n_components=100):
    """Return an impute -> scale -> PCA -> SVM pipeline.

    Parameters
    ----------
    n_components : int, default=100
        Number of principal components kept before the SVM. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    Pipeline
        Unfitted pipeline whose final step is an ``SVC`` with probability
        estimates enabled.
    """
    return Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        # PCA can select a randomized SVD solver; without a seed the projected
        # features (and therefore the SVM scores) change from run to run.
        ('pca', PCA(n_components=n_components, random_state=42)),
        ('svm', SVC(probability=True, random_state=42))
    ])


# Hyperparameter optimisation (XGB)

def optimize_xgb_hyperparameters(X_train, y_train):
    """Grid-search XGBoost settings inside the impute -> scale -> XGBoost pipeline.

    All 81 grid combinations are scored with 5-fold cross-validation on
    macro F1.

    Returns
    -------
    tuple
        ``(best_estimator_, best_score_)`` as reported by ``GridSearchCV``.
    """
    estimator = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ])

    # Keys carry the 'xgb__' prefix so they are routed to the classifier step
    search_space = dict(
        xgb__n_estimators=[100, 200, 300],    # number of trees in the ensemble
        xgb__max_depth=[3, 5, 7],             # maximum depth of each tree
        xgb__learning_rate=[0.01, 0.1, 0.2],  # step-size shrinkage against overfitting
        xgb__subsample=[0.8, 0.9, 1.0],       # row subsampling, adds bagging-like randomness
    )

    search = GridSearchCV(
        estimator,
        search_space,
        scoring='f1_macro',
        cv=5,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_, search.best_score_


# Models 

def evaluate_all_models(X_train, y_train):
    """Cross-validate the XGBoost and SVM pipelines on the training data.

    Each pipeline is scored with shuffled, stratified 5-fold cross-validation
    on macro F1 and one-vs-rest ROC AUC. Both metrics are computed from a
    single ``cross_validate`` run, so every fold is fitted once instead of
    once per metric.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.

    Returns
    -------
    tuple
        ``(results, models)``. ``results`` maps each model name to a dict with
        ``f1_mean``, ``f1_std``, ``roc_auc_mean`` and ``roc_auc_std``;
        ``models`` maps the same names to the pipeline objects that were
        evaluated. A model whose evaluation raises is reported on stdout and
        recorded with all four values set to 0.
    """
    models = {
        'XGBoost': create_xgb_pipeline(),
        'SVM': create_svm_pipeline()
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Score both metrics on the same fitted folds
    scoring = {'f1': 'f1_macro', 'roc_auc': 'roc_auc_ovr'}
    results = {}

    for name, model in models.items():
        print(f"Evaluating {name}...")
        try:
            cv_output = cross_validate(
                model, X_train, y_train, cv=cv,
                scoring=scoring, n_jobs=-1
            )
            cv_scores_f1 = cv_output['test_f1']
            cv_scores_roc = cv_output['test_roc_auc']

            results[name] = {
                'f1_mean': cv_scores_f1.mean(),
                'f1_std': cv_scores_f1.std(),
                'roc_auc_mean': cv_scores_roc.mean(),
                'roc_auc_std': cv_scores_roc.std()
            }

            print(f"{name} - F1: {cv_scores_f1.mean():.4f} (±{cv_scores_f1.std():.4f}), "
                  f"ROC AUC: {cv_scores_roc.mean():.4f} (±{cv_scores_roc.std():.4f})")

        except Exception as e:
            # Best effort: keep evaluating the remaining models
            print(f"Error evaluating {name}: {e}")
            results[name] = {
                'f1_mean': 0, 'f1_std': 0,
                'roc_auc_mean': 0, 'roc_auc_std': 0
            }

    return results, models



# Run models: cross-validate the candidate pipelines on the training split.
# `models` maps model names to the pipeline objects that were evaluated.

print("Evaluating all models...")
results, models = evaluate_all_models(X_train, y_train)


# Hyperparameter optimisation for XGB; the tuned pipeline joins `models`
# under the name 'Optimized_XGB'.

print("\nOptimizing hyperparameters for XGBoost...")

try:
    best_xgb_model, best_score = optimize_xgb_hyperparameters(X_train, y_train)
    print(f"Best cross-validation score: {best_score:.4f}")
    models['Optimized_XGB'] = best_xgb_model

except Exception as e:
    # Fallback: register an untuned XGBoost pipeline under the same name.
    # NOTE(review): only the message below reveals that the entry is then
    # not actually optimized.
    print(f"Hyperparameter optimization failed: {e}")
    models['Optimized_XGB'] = create_xgb_pipeline()


# Final training and test: fit every pipeline in `models` on the whole
# training split and score it once on the held-out test split.

print("\nTraining final models on full training data...")
final_results = {}

for name, model in models.items():
    print(f"Training {name}...")
    try:
        model.fit(X_train, y_train)
        f1, roc_auc = evaluate_model(model, X_test, y_test)
        
        final_results[name] = {'f1': f1, 'roc_auc': roc_auc}
        
        print(f"{name} - Test F1: {f1:.4f}, Test ROC AUC: {roc_auc:.4f}")

    except Exception as e:
        # Best effort: a failing model is reported and recorded with zero
        # scores, so it sorts last in the comparison instead of aborting
        # the loop.
        print(f"Error training {name}: {e}")
        final_results[name] = {'f1': 0, 'roc_auc': 0}


# Comparison: rank the models by test F1 and report the winner

print("\n" + "="*50)
print("Final comparision on test set")
print("="*50)

# Highest test F1 first
sorted_results = sorted(final_results.items(), key=lambda x: x[1]['f1'], reverse=True)

for name, scores in sorted_results:
    print(f"{name:25} - F1: {scores['f1']:.4f}, ROC AUC: {scores['roc_auc']:.4f}")

if sorted_results:
    best_model_name, best_scores = sorted_results[0]
    print(f"\nBEST MODEL: {best_model_name}")
    print(f"F1 Score: {best_scores['f1']:.4f}")
    print(f"ROC AUC OvR: {best_scores['roc_auc']:.4f}")

    # final_results is filled only by the loop over `models` in this cell, so
    # the winner is always one of its keys. The former fallback to
    # `baseline_models` was unreachable and referred to a name this cell does
    # not define.
    best_model = models[best_model_name]

    print(f"Best model '{best_model_name}' is ready for use!")


Evaluating all models...
Evaluating XGBoost...
XGBoost - F1: 0.7857 (±0.0126), ROC AUC: 0.9481 (±0.0088)
Evaluating SVM...
SVM - F1: 0.8570 (±0.0200), ROC AUC: 0.9741 (±0.0086)

Optimizing hyperparameters for XGBoost...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best cross-validation score: 0.8314

Training final models on full training data...
Training XGBoost...
XGBoost - Test F1: 0.8275, Test ROC AUC: 0.9658
Training SVM...
SVM - Test F1: 0.8813, Test ROC AUC: 0.9778
Training Optimized_XGB...
Optimized_XGB - Test F1: 0.8773, Test ROC AUC: 0.9743

Evaluating baseline models on test set...
these are not executed

FINAL COMPARISON ON TEST SET
SVM                       - F1: 0.8813, ROC AUC: 0.9778
Optimized_XGB             - F1: 0.8773, ROC AUC: 0.9743
XGBoost                   - F1: 0.8275, ROC AUC: 0.9658

BEST MODEL: SVM
F1 Score: 0.8813
ROC AUC OvR: 0.9778
Best model 'SVM' is ready for use!
