# MSc Simple Multi-Objective AutoML Pipeline

First I will look at 2 sets of data, 1 binary classification and one multiclass. I will do a complete pipeline and evaluation that will later be automated. That pipeline will include:

* Data Preprocessing
* Model Training
* Hyperparameter optimization.
* Model Evaluation with Accuracy
* Model Evaluation with Interpretability (specified metrics)
* Incorporating into a ranking function for model selection

Each automation will be constrained.

<img src="images\Pipeline.png"/>

Algorithms that will be used for modelling will be limited to:

1. Logistic Regression
2. Decision Trees
3. Random Forest
4. LightGBM
5. SVM

## Full Pipeline

In [7]:
# Importing the libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import loguniform, randint, uniform

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
import shap

# Import the algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

In [16]:
# Filter warnings
import warnings

warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names",
    category=UserWarning
)

### Model and Hyperparameter grids

In [58]:
# Define models
models = {
    "logreg": LogisticRegression(max_iter=5000),
    "dt": DecisionTreeClassifier(),
    "rf": RandomForestClassifier(),
    "svm": SVC(probability=True),
    "lgb": LGBMClassifier(),
}

# Define parameter space

param_grids = {
    "logreg" : [ #Not all solvers are compatible with each type of regularization. so an list of dictionaries is implemented to enforce compatibility.
        {   # lbfgs: supports l2
            "model__solver": ["lbfgs"],
            "model__penalty": ["l2"],
            "model__C": loguniform(1e-4, 1e2),
            "model__class_weight": [None, "balanced"],
        },
        {   # saga: supports elasticnet (needs l1_ratio)
            "model__solver": ["saga"],
            "model__penalty": ["elasticnet"],
            "model__C": loguniform(1e-4, 1e2),
            "model__l1_ratio": [0.1, 0.5, 0.9], #only used for elasticnet
            "model__class_weight": [None, "balanced"],
        },
        {   # saga can do l1 / l2 as well
            "model__solver": ["saga"],
            "model__penalty": ["l1", "l2"],
            "model__C": loguniform(1e-4, 1e2),
            "model__class_weight": [None, "balanced"],
        },
        ],

    "dt" : {
        "model__max_depth": randint(1, 20), 
        "model__min_samples_split": randint(2, 20), 
        "model__min_samples_leaf": randint(1, 20),
        "model__class_weight": [None, "balanced"],
    },
    "rf" : {
        "model__n_estimators": randint(100, 500),
        "model__max_depth": randint(1, 20),
        "model__min_samples_split": randint(2, 20),
        "model__class_weight": [None, "balanced", "balanced_subsample"],
    },
    "svm" : [
        {
            "model__kernel": ["linear"],
            "model__C": loguniform(1e-3, 1e2),
            "model__class_weight": [None, "balanced"],
        },
        {
            "model__kernel": ["rbf"],
            "model__C": loguniform(1e-3, 1e2),
            "model__gamma": ["scale", "auto"],
            "model__class_weight": [None, "balanced"],
        }
    ],
    "lgb" : {
        "model__num_leaves": randint(31, 60),       # controls tree complexity
        "model__max_depth": randint(5, 15),         # limits tree depth
        "model__learning_rate": loguniform(1e-3, 0.1), # step size
        "model__n_estimators": randint(100, 500),   # number of boosting rounds
        "model__subsample": uniform(0.7, 0.3),      # row sampling
        "model__colsample_bytree": uniform(0.7, 0.3), # feature sampling
        "model__class_weight": [None, "balanced"], # to handle class imbalance
    }
}

### Import and Split Data

Telco Customer Churn:

https://www.kaggle.com/datasets/blastchar/telco-customer-churn 

In [None]:
# Import dataset
churn_data = pd.read_csv("Datasets/Telco-Customer-Churn.csv")

# Split the data into training and test sets

le = LabelEncoder()

y = le.fit_transform(churn_data["Churn"]) # encode category labels
X = churn_data.drop("Churn", axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5634, 20), (5634,), (1409, 20), (1409,))

In [45]:
type(X_test)

pandas.core.frame.DataFrame

In [44]:
type(y_test)

numpy.ndarray

### Define Transformer class for pipeline

In [59]:
# Create a transformer class to group features in and drop columns

from sklearn.base import BaseEstimator, TransformerMixin

class GroupFeatures(BaseEstimator, TransformerMixin):

    # Constructor method
    def __init__(self, max_unique=20):
        self.max_unique = max_unique

    #Fit method learns column groupings and detects high cardinality --> metadata for future steps
    def fit(self, X, y=None):
        categorical = X.select_dtypes(include=["object", "string", "category"])
        numeric = X.select_dtypes(include="number")
        boolean = X.select_dtypes(include="bool")

        high_cardinality = categorical.nunique() > self.max_unique #Detects high cardinality above maximum value specified

        self.numeric_columns_ = numeric.columns.union(boolean.columns).tolist() #Creates numerical column list
        self.categorical_columns_ = high_cardinality[~high_cardinality].index.tolist() #Creates categorical column list - high cardinality features

        self.keep_columns_ = self.numeric_columns_ + self.categorical_columns_

        return self

    # passes through the data untransformed 
    def transform(self, X):
        return X[self.keep_columns_]

### Define Preprocessor

In [9]:
# ColumnTransformer for feature preprocessing
from sklearn.compose import make_column_selector

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), make_column_selector(dtype_include="number")), 
        ("cat", OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include=["object", "string", "category"])),
    ],
    remainder="drop"
).set_output(transform="default")

### Define Function for Hyperparameter Optimization using RandomizedSearchCV

In [10]:
# Function for randomized search cross validation search

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline

def run_randomized_search(
    models,
    param_grids,
    X_train,
    y_train,
    preprocessor,
    feature_grouper_cls=GroupFeatures,
    feature_grouper_kwargs={"max_unique": 20},
    n_iter=5,
    n_splits=5,
    scoring=None,
    refit_metric="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=True,
):
    """
    Runs RandomizedSearchCV for multiple models wrapped in a pipeline.

    Returns
    -------
    results : dict
        {model_name: fitted RandomizedSearchCV object}
    """

    if scoring is None:
        scoring = {
            "accuracy": "accuracy",
            "f1": "f1_weighted",
            "roc_auc": "roc_auc_ovr",
        }

    cv = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=random_state
    )

    results = {}

    for name, model in models.items():

        pipeline = Pipeline([
            ("features", feature_grouper_cls(**feature_grouper_kwargs)),
            ("preprocess", preprocessor),
            ("model", model),
        ])

        clf = RandomizedSearchCV(
            estimator=pipeline,
            param_distributions=param_grids[name],
            n_iter=n_iter,
            scoring=scoring,
            cv=cv,
            n_jobs=n_jobs,
            random_state=random_state,
            refit=refit_metric,
        )

        search = clf.fit(X_train, y_train)
        results[name] = search

        if verbose:
            best_idx = search.best_index_

            print("\n==============================")
            print(f"MODEL: {name}")
            print("==============================")
            print("Best Params:", search.best_params_)
            print(f"Best ROC-AUC:  {search.cv_results_['mean_test_roc_auc'][best_idx]:.4f}")
            print(f"Best Accuracy:{search.cv_results_['mean_test_accuracy'][best_idx]:.4f}")
            print(f"Best F1:      {search.cv_results_['mean_test_f1'][best_idx]:.4f}")

    return results

### Define function for model evaluation on test set

In [35]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def evaluate_model_on_test(best_pipeline, X_test, y_test):
    y_pred = best_pipeline.predict(X_test)
    y_proba = best_pipeline.predict_proba(X_test)
    
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_weighted": f1_score(y_test, y_pred, average="weighted"), # keeping multiclass metrics
    }

    print("Unique classes in y_test:", np.unique(y_test))
    print("Shape of y_proba:", y_proba.shape)
    print("Classes learned by model:", best_pipeline.named_steps["model"].classes_)

    n_classes = len(best_pipeline.named_steps["model"].classes_)

    # ROC-AUC handling binary and multiclass
    n_classes = len(np.unique(y_test))
    if n_classes == 2:
        metrics["roc_auc"] = roc_auc_score(y_test, y_proba[:, 1]) # Binary
    else:
        metrics["roc_auc"] = roc_auc_score(y_test, y_proba, multi_class="ovo") # Multiclass
    
    return metrics, y_pred, y_proba

### Define SHAP stability function

In [12]:
def shap_stability_perturbations(
    best_pipeline,
    X_train,
    X_test,
    numerical_columns,
    n_background=30, #100-200
    n_samples=50, #200-300
    n_perturb=10, # 30-50
    noise_level=0.02,   # 2% of train std
    random_state=42
):
    
    # Define cosine similarity inside function
    def cosine_similarity(a, b, eps=1e-12):
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + eps)
    
    # Technically not needed with current model setup, however will keep for models without predict_proba
    def model_output_fn(model):
        return model.predict_proba   # returns shape: (n_samples, n_classes)

    rng = np.random.default_rng(random_state)

    # Extract feature names from the pipeline for future conversion of transformed arrays back into DataFrames 
    feature_names = best_pipeline.named_steps["preprocess"].get_feature_names_out()

    # Transform background data for masker
    X_train_background_raw = shap.sample(X_train, n_background, random_state=random_state) #first sample from training data so that preprocessing is done on fewer instances
    X_train_background_feat = best_pipeline.named_steps["features"].transform(X_train_background_raw)
    X_train_background_trans = best_pipeline.named_steps["preprocess"].transform(X_train_background_feat)

    #Create Masker
    masker = shap.maskers.Independent(X_train_background_trans)

    #Define final fitted model
    model = best_pipeline.named_steps["model"]
        
    #Create SHAP explainer
    explainer = shap.PermutationExplainer(model_output_fn(model), masker)

    # Prepare test samples 
    X_explain_raw = shap.sample(X_test, n_samples, random_state=random_state)
    X_explain_feat = best_pipeline.named_steps["features"].transform(X_explain_raw)
    X_explain_trans = best_pipeline.named_steps["preprocess"].transform(X_explain_feat)

    # Calculate original SHAP values
    shap_orig = explainer(X_explain_trans).values  # shape: (n_samples, n_features, n_classes)

    # Compute standard deviation for numeric columns 
    train_stds = X_train[numerical_columns].std(ddof=0).replace(0, 1e-12)

    # Perturb each sample K times, recompute SHAP, compute stability

    stability_scores = []

    for i in range(len(X_explain_raw)):
        x0 = X_explain_raw.iloc[i:i+1].copy() # make a copy of the sample

        # select the original shap vector for sample i
        s0 = shap_orig[i] # Shape: (n_features, n_classes)

        similarity_scores = []

        for _ in range(n_perturb):
            x_pert = x0.copy()

            # Ensuring numerical columns are float for compatible dtype when adding noise
            x_pert[numerical_columns] = x_pert[numerical_columns].astype(float)

            # add a small amount of noise to all numeric columns at once
            noise = rng.normal(loc=0.0, scale=noise_level * train_stds.values, size=(1, len(numerical_columns)))
            x_pert.loc[:, numerical_columns] = x_pert[numerical_columns].values + noise

            # Transform perturbed sample
            x_pert_feat = best_pipeline.named_steps["features"].transform(x_pert)
            x_pert_trans = best_pipeline.named_steps["preprocess"].transform(x_pert_feat)

            # calculate shap for perturbed sample
            shap_pert = explainer(x_pert_trans).values[0]

            class_similarities = []

            for c in range(s0.shape[1]):
                similarity = cosine_similarity(s0[:,c], shap_pert[:,c])
                class_similarities.append((similarity + 1) / 2) # appending class similarity to list scaled between 0 and 1
        
            similarity_scores.append(np.mean(class_similarities)) #appends mean of similarity scores across classes to similarity scores

        # calculate the average stability score
        stability_scores.append(np.mean(similarity_scores))
    
    print(f"{model.__class__.__name__} SHAP analysis complete.")
    
    return np.array(stability_scores)

### Define Multi-Objective scoring function

In [13]:
# define scoring function
def multi_objective_score(performance, SHAP_stab, perf_ratio=0.5):
    assert 0 <= performance <= 1
    assert 0 <= SHAP_stab <= 1
    assert 0 <= perf_ratio <= 1
    return perf_ratio * performance + (1-perf_ratio) * SHAP_stab

### Implement full pipeline

In [14]:
results = run_randomized_search(
    models=models,
    param_grids=param_grids,
    X_train=X_train,
    y_train=y_train,
    preprocessor=preprocessor,
    feature_grouper_cls=GroupFeatures,
    feature_grouper_kwargs={"max_unique": 20},
    n_iter=5,
    n_splits=5,
    scoring=None,
    refit_metric="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=True,
)

results_dict = {}

for name, search in results.items():

    best_pipeline = search.best_estimator_
    grouper = best_pipeline.named_steps["features"]

    numerical_columns = grouper.numeric_columns_
    categorical_columns = grouper.categorical_columns_
    keep_columns = grouper.keep_columns_

    metrics, y_pred, y_proba = evaluate_model_on_test(
        best_pipeline=best_pipeline,
        X_test=X_test,
        y_test=y_test)

    stability = shap_stability_perturbations(
        best_pipeline,
        X_train,
        X_test,
        numerical_columns=numerical_columns,
        n_background=30,
        n_samples=50,
        n_perturb=10,
        noise_level=0.02,   
        random_state=42
        )
    
    mean_stability = stability.mean()

    ranking_score = multi_objective_score(performance=metrics['roc_auc'], SHAP_stab=mean_stability)

    results_dict[name] = {
        "metrics": metrics,
        "y_pred": y_pred,
        "y_proba": y_proba,
        "stability": stability,
        "mean_stability": mean_stability,
        "std_stability": stability.std(),
        "ranking_score": ranking_score
        }




MODEL: logreg
Best Params: {'model__C': np.float64(0.39079671568228835), 'model__class_weight': None, 'model__penalty': 'l2', 'model__solver': 'saga'}
Best ROC-AUC:  0.8385
Best Accuracy:0.7984
Best F1:      0.7915

MODEL: dt
Best Params: {'model__class_weight': None, 'model__max_depth': 4, 'model__min_samples_leaf': 8, 'model__min_samples_split': 4}
Best ROC-AUC:  0.8231
Best Accuracy:0.7865
Best F1:      0.7710

MODEL: rf
Best Params: {'model__class_weight': None, 'model__max_depth': 7, 'model__min_samples_split': 12, 'model__n_estimators': 187}
Best ROC-AUC:  0.8385
Best Accuracy:0.7973
Best F1:      0.7849

MODEL: svm
Best Params: {'model__C': np.float64(0.16949768237884735), 'model__class_weight': None, 'model__kernel': 'linear'}
Best ROC-AUC:  0.8339
Best Accuracy:0.7964
Best F1:      0.7900
[LightGBM] [Info] Number of positive: 1496, number of negative: 4138
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000998 seconds.
You can set `forc

PermutationExplainer explainer: 51it [00:18,  1.30it/s]                        


SVC SHAP analysis complete.




LGBMClassifier SHAP analysis complete.


In [15]:
summary = pd.DataFrame({
    "model": results_dict.keys(),
    "mean_stability": [results_dict[m]["mean_stability"] for m in results_dict.keys()],
    "std_stability": [results_dict[m]["std_stability"] for m in results_dict.keys()],
    "accuracy": [results_dict[m]["metrics"]["accuracy"] for m in results_dict.keys()],
    "f1_weighted": [results_dict[m]["metrics"]["f1_weighted"] for m in results_dict.keys()],
    "roc_auc": [results_dict[m]["metrics"]["roc_auc"] for m in results_dict.keys()],
    "ranking_score": [results_dict[m]["ranking_score"] for m in results_dict.keys()],
})

# Sort by mean stability descending
summary = summary.sort_values("ranking_score", ascending=False).reset_index(drop=True)
print("\n===== Model Summary and Scoring =====")
print(summary)


===== Model Summary and Scoring =====
    model  mean_stability  std_stability  accuracy  f1_weighted   roc_auc  \
0      rf        0.998815       0.001302  0.807665     0.796814  0.862650   
1  logreg        0.999782       0.000254  0.820440     0.815416  0.860708   
2     lgb        0.997309       0.001805  0.803407     0.786120  0.860744   
3     svm        0.999796       0.000317  0.819730     0.816144  0.856374   
4      dt        0.997209       0.004990  0.794890     0.782467  0.837847   

   ranking_score  
0       0.930733  
1       0.930245  
2       0.929026  
3       0.928085  
4       0.917528  


### Try on a multiclass dataset

Wine Quality Dataset

https://archive.ics.uci.edu/dataset/186/wine+quality 

In [46]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
wine_quality = fetch_ucirepo(id=186) 

le = LabelEncoder()

# data (as pandas dataframes) 
X_wine = wine_quality.data.features 
y_wine = le.fit_transform(wine_quality.data.targets)
  
# metadata 
print(wine_quality.metadata) 
  
# variable information 
print(wine_quality.variables) 


{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Wed Nov 15 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'ID': 252, 'type': 'NATIVE', 'title': 'Modeling wine preferences

  y = column_or_1d(y, warn=True)


In [47]:
X_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [53]:
# Split the data into training and test sets

X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(X_wine, y_wine, test_size=0.2, random_state=42, stratify=y_wine)

X_train_wine.shape, y_train_wine.shape, X_test_wine.shape, y_test_wine.shape

((5197, 11), (5197,), (1300, 11), (1300,))

In [54]:
type(y_train_wine)

numpy.ndarray

In [55]:
type(y_test_wine)

numpy.ndarray

In [60]:
results = run_randomized_search(
    models=models,
    param_grids=param_grids,
    X_train=X_train_wine, ## change
    y_train=y_train_wine, ## change
    preprocessor=preprocessor,
    feature_grouper_cls=GroupFeatures,
    feature_grouper_kwargs={"max_unique": 20},
    n_iter=5,
    n_splits=4,
    scoring=None,
    refit_metric="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=True,
)

results_dict = {}

for name, search in results.items():

    best_pipeline = search.best_estimator_
    grouper = best_pipeline.named_steps["features"]

    numerical_columns = grouper.numeric_columns_
    categorical_columns = grouper.categorical_columns_
    keep_columns = grouper.keep_columns_

    metrics, y_pred, y_proba = evaluate_model_on_test(
        best_pipeline=best_pipeline,
        X_test=X_test_wine, ##change
        y_test=y_test_wine) ## change

    stability = shap_stability_perturbations(
        best_pipeline,
        X_train_wine, ## Change
        X_test_wine, ## Change
        numerical_columns=numerical_columns,
        n_background=30,
        n_samples=50,
        n_perturb=10,
        noise_level=0.02,   
        random_state=42
        )
    
    mean_stability = stability.mean()

    ranking_score = multi_objective_score(performance=metrics['roc_auc'], SHAP_stab=mean_stability)

    results_dict[name] = {
        "metrics": metrics,
        "y_pred": y_pred,
        "y_proba": y_proba,
        "stability": stability,
        "mean_stability": mean_stability,
        "std_stability": stability.std(),
        "ranking_score": ranking_score
        }




MODEL: logreg
Best Params: {'model__C': np.float64(6.015308718396457), 'model__class_weight': None, 'model__penalty': 'l1', 'model__solver': 'saga'}
Best ROC-AUC:  0.7755
Best Accuracy:0.5419
Best F1:      0.5081

MODEL: dt
Best Params: {'model__class_weight': None, 'model__max_depth': 7, 'model__min_samples_leaf': 19, 'model__min_samples_split': 12}
Best ROC-AUC:  0.7221
Best Accuracy:0.5357
Best F1:      0.5161

MODEL: rf
Best Params: {'model__class_weight': None, 'model__max_depth': 7, 'model__min_samples_split': 12, 'model__n_estimators': 187}
Best ROC-AUC:  0.8059
Best Accuracy:0.5751
Best F1:      0.5390

MODEL: svm
Best Params: {'model__C': np.float64(0.0051800516442430215), 'model__class_weight': None, 'model__gamma': 'auto', 'model__kernel': 'rbf'}
Best ROC-AUC:  0.7632
Best Accuracy:0.4366
Best F1:      0.2654
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And i

PermutationExplainer explainer: 51it [00:18,  1.23it/s]                        


RandomForestClassifier SHAP analysis complete.
Unique classes in y_test: [0 1 2 3 4 5 6]
Shape of y_proba: (1300, 7)
Classes learned by model: [0 1 2 3 4 5 6]


PermutationExplainer explainer: 51it [04:12,  5.14s/it]                        


SVC SHAP analysis complete.
Unique classes in y_test: [0 1 2 3 4 5 6]
Shape of y_proba: (1300, 7)
Classes learned by model: [0 1 2 3 4 5 6]


PermutationExplainer explainer: 51it [01:32,  2.05s/it]                        


LGBMClassifier SHAP analysis complete.


In [62]:
summary_wine = pd.DataFrame({
    "model": results_dict.keys(),
    "mean_stability": [results_dict[m]["mean_stability"] for m in results_dict.keys()],
    "std_stability": [results_dict[m]["std_stability"] for m in results_dict.keys()],
    "accuracy": [results_dict[m]["metrics"]["accuracy"] for m in results_dict.keys()],
    "f1_weighted": [results_dict[m]["metrics"]["f1_weighted"] for m in results_dict.keys()],
    "roc_auc": [results_dict[m]["metrics"]["roc_auc"] for m in results_dict.keys()],
    "ranking_score": [results_dict[m]["ranking_score"] for m in results_dict.keys()],
})

# Sort by mean stability descending
summary_wine = summary_wine.sort_values("ranking_score", ascending=False).reset_index(drop=True)
print("\n===== Model Summary and Scoring =====")
print(summary_wine)


===== Model Summary and Scoring =====
    model  mean_stability  std_stability  accuracy  f1_weighted   roc_auc  \
0      rf        0.996413       0.003241  0.583846     0.553150  0.787636   
1     lgb        0.944810       0.026613  0.671538     0.664859  0.836045   
2     svm        0.997311       0.001820  0.436154     0.264916  0.770371   
3  logreg        0.995185       0.004101  0.534615     0.506378  0.763161   
4      dt        0.984431       0.021470  0.536154     0.518726  0.734694   

   ranking_score  
0       0.892025  
1       0.890427  
2       0.883841  
3       0.879173  
4       0.859563  


In [63]:
print(summary)

    model  mean_stability  std_stability  accuracy  f1_weighted   roc_auc  \
0      rf        0.998815       0.001302  0.807665     0.796814  0.862650   
1  logreg        0.999782       0.000254  0.820440     0.815416  0.860708   
2     lgb        0.997309       0.001805  0.803407     0.786120  0.860744   
3     svm        0.999796       0.000317  0.819730     0.816144  0.856374   
4      dt        0.997209       0.004990  0.794890     0.782467  0.837847   

   ranking_score  
0       0.930733  
1       0.930245  
2       0.929026  
3       0.928085  
4       0.917528  
