In [16]:
import pandas as pd
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    GridSearchCV,
)
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from xgboost import XGBClassifier



In [None]:
# XGB and SVM pipeline and training

# Read the training CSV; 'Outcome' is the target, 'Id' is excluded from the features
data = pd.read_csv('training_balanced_data.csv')
y = data['Outcome']
X = data.drop(columns=['Outcome', 'Id'])

# Hold out 20% of the rows as a test split (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluation function
def evaluate_model(model, X_test, y_test):
    """Score an already-fitted classifier on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test
        Feature rows to score.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(f1, roc_auc)``: macro-averaged F1 computed from the hard
        predictions, and macro-averaged one-vs-rest ROC AUC computed from
        the predicted class probabilities.
    """
    predicted_labels = model.predict(X_test)
    class_probabilities = model.predict_proba(X_test)

    return (
        f1_score(y_test, predicted_labels, average='macro'),
        roc_auc_score(y_test, class_probabilities, multi_class='ovr', average='macro'),
    )

# Pipeline factory functions
def create_xgb_pipeline():
    """Build a new, unfitted XGBoost pipeline.

    Steps, in order: KNN imputation of missing values (3 neighbours),
    feature standardisation, then an ``XGBClassifier`` seeded with
    ``random_state=42`` and using ``mlogloss`` as its evaluation metric.

    Returns
    -------
    Pipeline
        The assembled pipeline.
    """
    steps = [
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ]
    return Pipeline(steps)

def create_svm_pipeline():
    """Build a new, unfitted SVM pipeline.

    Steps, in order: KNN imputation of missing values (3 neighbours),
    feature standardisation, PCA down to 100 components, then an ``SVC``
    with probability estimates enabled.

    Returns
    -------
    Pipeline
        The assembled pipeline.
    """
    return Pipeline([
        ('imputer', KNNImputer(n_neighbors=3)),  # fill missing values using KNNImputer
        ('scaler', StandardScaler()),  # feature scaling
        # Reduce the dimensionality to 100 components. PCA is seeded like the
        # other steps: random_state is only consulted by the randomized/arpack
        # solvers, which svd_solver='auto' may select depending on the data
        # shape, so without it repeated runs are not guaranteed to match.
        ('pca', PCA(n_components=100, random_state=42)),
        ('svm', SVC(probability=True, random_state=42)),  # SVM classifier with probability estimates
    ])

# Cross-validation
def evaluate_all_models(X_train, y_train):
    """Cross-validate every candidate pipeline on the training split.

    Each model is evaluated with shuffled, seeded 10-fold stratified
    cross-validation. Both metrics are computed in a single
    ``cross_validate`` run, so every pipeline is fitted once per fold
    instead of once per fold per metric.

    Parameters
    ----------
    X_train
        Training features.
    y_train
        Training labels.

    Returns
    -------
    dict
        Maps model name to ``{'f1_mean': ..., 'roc_auc_mean': ...}``, the
        mean macro F1 and mean one-vs-rest ROC AUC over the folds. A model
        whose evaluation raised is reported on stdout and left out.
    """
    models = {
        'XGBoost': create_xgb_pipeline(),  # create xgb pipeline
        'SVM': create_svm_pipeline(),  # create svm pipeline
    }

    # cross validation with 10 folds
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scoring = ['f1_macro', 'roc_auc_ovr']

    results = {}  # store metrics for each model

    # Loop evaluation for each model
    for name, model in models.items():
        print(f"Evaluating {name}")  # print which model is currently evaluated

        try:
            # One 10-fold run scores both metrics on the same fitted folds;
            # results are keyed as 'test_<scorer name>'.
            cv_scores = cross_validate(model, X_train, y_train, cv=cv,
                                       scoring=scoring, n_jobs=-2)
            f1_mean = cv_scores['test_f1_macro'].mean()
            roc_auc_mean = cv_scores['test_roc_auc_ovr'].mean()
        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the next model
            print(f"Error evaluating {name}: {e}")
            continue

        results[name] = {  # store the mean of metrics in results
            'f1_mean': f1_mean,
            'roc_auc_mean': roc_auc_mean,
        }
        # print metric scores
        print(f"{name} - F1: {f1_mean:.4f}, "
              f"ROC AUC: {roc_auc_mean:.4f}")

    return results

# Model comparison: cross-validate every candidate pipeline on the training split
print("Compare models\n")
results = evaluate_all_models(X_train, y_train) # gives the scores of each model

# Print summary of results (models whose evaluation failed are absent from `results`)
print("\nFinal cross-validation results:")
for model_name, metrics in results.items():
    print(f"{model_name:10} - F1: {metrics['f1_mean']:.4f}, "
          f"ROC AUC: {metrics['roc_auc_mean']:.4f} ")


Compare models

Evaluating XGBoost
XGBoost - F1: 0.8096, ROC AUC: 0.9515
Evaluating SVM
SVM - F1: 0.8728, ROC AUC: 0.9765

Final cross-validation results:
XGBoost    - F1: 0.8096, ROC AUC: 0.9515 
SVM        - F1: 0.8728, ROC AUC: 0.9765 


In [18]:

# Hyperparameter tuning for SVM

# SVM tuning pipeline
pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=3)), # fill missing values using KNNImputer
    ('scaler', StandardScaler()), # feature scaling
    # n_components is left unset here because it is tuned by the grid below.
    # PCA is seeded like the SVC: random_state is only consulted by the
    # randomized/arpack solvers, which svd_solver='auto' may select depending
    # on the data shape, so without it repeated searches may not match.
    ('pca', PCA(random_state=42)),
    ('svm', SVC(probability=True, random_state=42)) # svm classifier with probabilities
])

# define hyperparameter grid
param_grid = {
    'pca__n_components': [50, 100, 150], # number of PCA components to try
    'svm__C': [0.5, 1, 2, 5], # controls the trade-off between a wider margin (low C) and correctly classifying all points (high C)
    'svm__gamma': ['scale', 0.01, 0.001] # determines how far the influence of each data point reaches; a high gamma fits tightly to the data
}

# GridSearchCV setup
grid = GridSearchCV(
    pipeline, # pipeline to tune
    param_grid, # defined parameter combinations
    cv=10, # 10-fold cross validation
    scoring='f1_macro', # metric used to rank parameter combinations
    n_jobs=-2, # use all but one CPU core
)

# train model for all parameter combinations
grid.fit(X_train, y_train)

# print best parameters
print(grid.best_params_)

# print best cross validation f1 score
print(grid.best_score_)

# save best model
best_svm = grid.best_estimator_



{'pca__n_components': 150, 'svm__C': 2, 'svm__gamma': 'scale'}
0.873167734046109


In [19]:
# Evaluate the tuned SVM on the held-out test split

# X_test / y_test from the earlier train/test split cell are reused here:
# re-reading the CSV and re-splitting with the same random_state only
# reproduced identical data. No extra fit is needed either, because
# GridSearchCV (refit=True by default) has already refitted best_estimator_
# on all of X_train with the best parameters found.

# Prediction of final model
y_pred = best_svm.predict(X_test)
y_pred_proba = best_svm.predict_proba(X_test)

# compute metrics
f1 = f1_score(y_test, y_pred, average='macro') # macro f1 score
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro') # macro one-vs-rest ROC AUC score

print("\nFinal prediciton on test set:")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")




Final prediciton on test set:
F1 Score: 0.8942
ROC AUC: 0.9818
