In [2]:
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [None]:
# =============================================
# PART 0: Nested Cross-Validation for Model Selection
# =============================================


# Read the training table from disk.
data = pd.read_csv('training_balanced_data.csv')

# 'Outcome' is the label; it and the 'Id' column are both excluded from the
# feature matrix.
y = data['Outcome']
X = data.drop(['Outcome', 'Id'], axis=1)

# Validation helper shared by every candidate model
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : features passed to both prediction methods.
    y_test : true labels for ``X_test``.

    Returns
    -------
    tuple
        ``(f1, roc_auc)`` where ``f1`` is the macro-averaged F1 score and
        ``roc_auc`` is the ROC AUC (macro one-vs-rest when more than two
        probability columns are returned).
    """
    y_pred = model.predict(X_test)
    y_pred_proba = np.asarray(model.predict_proba(X_test))

    f1 = f1_score(y_test, y_pred, average='macro')

    if y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 2:
        # Two-class problem: roc_auc_score needs a 1-D score vector, so pass
        # the second probability column (the class with the greater label)
        # instead of the full matrix, which would raise.
        roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
    else:
        roc_auc = roc_auc_score(
            y_test, y_pred_proba, multi_class='ovr', average='macro'
        )

    return f1, roc_auc

# Pipeline factories, one per candidate classifier
def create_xgb_pipeline():
    """Return an unfitted pipeline: KNN imputation -> standard scaling -> XGBoost."""
    booster = XGBClassifier(random_state=42, eval_metric='mlogloss')
    stages = [
        ('imputer', KNNImputer(n_neighbors=3)),  # fill missing values from 3 neighbours
        ('scaler', StandardScaler()),            # standardise every feature
        ('xgb', booster),                        # final classifier
    ]
    return Pipeline(steps=stages)

def create_svm_pipeline():
    """Return an unfitted pipeline: KNN imputation -> scaling -> PCA -> SVC.

    The SVC is created with ``probability=True`` so the pipeline exposes
    ``predict_proba``. PCA is seeded like every other stochastic step in this
    notebook so that repeated runs give the same projection when a randomized
    SVD solver is selected.
    """
    return Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),  # fill missing values from 3 neighbours
        ('scaler', StandardScaler()),            # standardise every feature
        # Project onto 100 components; seeded for reproducibility.
        ('pca', PCA(n_components=100, random_state=42)),
        ('svm', SVC(probability=True, random_state=42)),
    ])

def create_voting_pipeline():
    """Return an unfitted pipeline ending in a soft-voting ensemble.

    The ensemble combines a random forest, an XGBoost classifier and a
    logistic regression; all three receive the same imputed, scaled features.
    """
    members = [
        ('rf', RandomForestClassifier(n_estimators=250, random_state=42)),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
        ('lr', LogisticRegression(max_iter=1000, random_state=42)),
    ]
    ensemble = VotingClassifier(estimators=members, voting='soft')

    stages = [
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('voting', ensemble),
    ]
    return Pipeline(steps=stages)

def create_balanced_rf_pipeline():
    """Return an unfitted pipeline: KNN imputation -> scaling -> balanced random forest."""
    forest = BalancedRandomForestClassifier(random_state=42)
    stages = [
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('balanced_rf', forest),
    ]
    return Pipeline(steps=stages)

def create_smote_pipeline():
    """Return an unfitted pipeline: KNN imputation -> scaling -> SMOTE -> XGBoost.

    SMOTE is a sampler (it implements ``fit_resample`` rather than
    ``transform``), so it cannot be an intermediate step of
    ``sklearn.pipeline.Pipeline``. The imbalanced-learn pipeline is used
    instead: it applies samplers while fitting only and skips them at
    prediction time.
    """
    return ImbPipeline([
        ('imputer', KNNImputer(n_neighbors=3)),  # fill missing values from 3 neighbours
        ('scaler', StandardScaler()),            # standardise every feature
        ('smote', SMOTE(random_state=42)),       # synthetic oversampling
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ])

def create_feature_selection_pipeline(k=500, n_features=None):
    """Return an unfitted pipeline: imputation -> scaling -> SelectKBest -> XGBoost.

    Parameters
    ----------
    k : int, default 500
        Maximum number of features to keep (scored with ``f_classif``).
    n_features : int or None, default None
        Number of columns in the matrix the pipeline will be fitted on. When
        omitted, the column count of the notebook-level training matrix ``X``
        is used, which reproduces the original behaviour.

    Returns
    -------
    Pipeline
        The selector keeps ``min(k, n_features)`` features so that ``k`` never
        exceeds the number of available columns.
    """
    if n_features is None:
        # Fall back to the global training matrix defined earlier in the notebook.
        n_features = X.shape[1]
    n_selected = min(k, n_features)

    return Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(f_classif, k=n_selected)),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ])

# Registry of candidate pipelines: display name -> freshly built pipeline.
# Insertion order is the order in which the models are evaluated.
models = {
    label: build()
    for label, build in (
        ('XGBoost', create_xgb_pipeline),
        ('SVM', create_svm_pipeline),
        ('Voting_Classifier', create_voting_pipeline),
        ('Balanced_RF', create_balanced_rf_pipeline),
        ('SMOTE_XGB', create_smote_pipeline),
        ('Feature_Selection_XGB', create_feature_selection_pipeline),
    )
}

# Model comparison with 10-fold stratified cross-validation.
# NOTE: this is a single-level CV loop; no hyperparameter search runs inside
# the folds, so every pipeline is scored with its fixed settings.
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
outer_results = {}  # model name -> aggregated fold metrics

print("Running nested cv for model comparison")

for name, model in models.items():
    f1_scores = []   # macro F1 per fold
    roc_scores = []  # ROC AUC per fold

    for train_idx, test_idx in outer_cv.split(X, y):
        X_train_cv, X_test_cv = X.iloc[train_idx], X.iloc[test_idx]
        y_train_cv, y_test_cv = y.iloc[train_idx], y.iloc[test_idx]

        # Fit an unfitted copy for each fold so that no state carries over
        # between folds and the entries in `models` stay pristine templates.
        fold_model = clone(model)
        fold_model.fit(X_train_cv, y_train_cv)

        f1, roc = evaluate_model(fold_model, X_test_cv, y_test_cv)
        f1_scores.append(f1)
        roc_scores.append(roc)

    f1_mean = np.mean(f1_scores)
    roc_mean = np.mean(roc_scores)

    # Means drive the selection below; the standard deviations are kept so the
    # fold-to-fold spread can be inspected when two models score closely.
    outer_results[name] = {
        'f1_mean': f1_mean,
        'roc_mean': roc_mean,
        'f1_std': np.std(f1_scores),
        'roc_std': np.std(roc_scores),
    }
    print(f"{name} - F1: {f1_mean:.4f}, "
          f"ROC AUC: {roc_mean:.4f}")

# Select the model with the highest mean macro F1 across folds.
best_model_name = max(outer_results, key=lambda k: outer_results[k]['f1_mean'])
print(f"\nBest model from nested CV: {best_model_name}")

# Work on an unfitted copy so later tuning/fitting does not mutate the registry.
best_model = clone(models[best_model_name])


Running nested cv for model comparison


In [None]:
# =============================================
# PART 1: Hyperparameter Tuning of Selected Model
# =============================================

# GridSearchCV comes from the notebook's import cell; it is not re-imported here.

# Choose a search grid from the winning model's name. Grid keys follow the
# '<step name>__<parameter>' convention, so they rely on the step names
# 'xgb', 'pca' and 'svm' used when the pipelines are built.
if 'XGB' in best_model_name:
    param_grid = {
        'xgb__n_estimators': [100, 200, 300],
        'xgb__max_depth': [3, 5, 7],
        'xgb__learning_rate': [0.01, 0.1, 0.2],
        'xgb__subsample': [0.8, 0.9, 1.0]
    }
elif 'SVM' in best_model_name:
    param_grid = {
        'pca__n_components': [50, 100, 150],
        'svm__C': [0.5, 1, 2, 5],
        'svm__gamma': ['scale', 0.01, 0.001]
    }
else:
    param_grid = {}  # no grid defined for the remaining pipelines; tuning is skipped

if param_grid:
    print(f"Running hyperparameter tuning for {best_model_name}...")

    # Shuffled, seeded stratified folds, matching the splitting strategy used
    # for the model comparison. An integer `cv` would give unshuffled folds,
    # which depend on the row order of the training file.
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    grid = GridSearchCV(
        best_model,
        param_grid,
        cv=inner_cv,
        scoring='f1_macro',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X, y)

    # With the default refit=True the best estimator is already refitted on
    # all of X, y by the search.
    best_model = grid.best_estimator_
    print(f"Best hyperparameters: {grid.best_params_}")


In [None]:
# =============================================
# PART 2: Final Training on Full Dataset
# =============================================

# Fit the selected (and possibly tuned) pipeline on every training row.
print(f"\nTraining final {best_model_name} on all training data...")
best_model = best_model.fit(X, y)  # pipeline fit() hands back the fitted estimator

print("Final model is ready to predict on new/unseen data.")


In [None]:
# =============================================
# PART 3: Predict on New Dataset
# =============================================

# Example:
new_data = pd.read_csv('new_dataset.csv')

# Training used every column except 'Outcome' and 'Id', so strip both here if
# they are present; a leftover 'Outcome' column would otherwise be treated as
# a feature.
X_new = new_data.drop(columns=['Id', 'Outcome'], errors='ignore')

# Fail early, with a readable message, if the new file lacks training features.
missing_cols = [col for col in X.columns if col not in X_new.columns]
if missing_cols:
    raise ValueError(
        f"new_dataset.csv is missing {len(missing_cols)} training feature(s), "
        f"e.g. {missing_cols[:5]}"
    )

# Keep exactly the training columns, in the training order.
X_new = X_new[list(X.columns)]

# Predictions: hard labels and per-class probabilities
y_pred_new = best_model.predict(X_new)
y_pred_proba_new = best_model.predict_proba(X_new)

print(f"Predictions on new data completed. Shape: {y_pred_new.shape}")
