The `NestedCVOptimizer` should take any estimator, an optimizer, and inner and outer CV methods, and return the tuning results.

In [3]:
import lightgbm as lgb
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFECV
from mlops_class import FeatureSelector, MLUtils
import pandas as pd
import numpy as np

In [4]:
class NestedCVOptimizer:
    """Outer cross-validation loop wrapped around an inner optimizer.

    For each outer fold the inner ``optimizer`` picks a configuration on the
    training part, ``estimator`` is refit on that part, and ``scorer``
    evaluates it on the held-out part.

    Only ``task='feature_selection'`` is implemented; for that task the
    optimizer must expose ``select_features(X, y)`` returning
    ``(selected_features, inner_score)``.
    """

    def __init__(self, estimator=None,
                 cv=None,
                 task='feature_selection',
                 optimizer=None, scorer=None):
        """Store the nested-CV configuration.

        Args:
            estimator: model with a ``fit`` method. ``None`` (default) builds
                a fresh ``lgb.LGBMClassifier(verbose=-1)``.
            cv: outer splitter exposing ``split(X, y)``. ``None`` (default)
                builds ``StratifiedKFold(n_splits=5, shuffle=True,
                random_state=42)``.
            task (str): which optimization to run; only
                ``'feature_selection'`` is handled by ``optimize``.
            optimizer: inner optimizer (e.g. a ``FeatureSelector``).
            scorer: callable invoked as ``scorer(estimator, X, y)``.
        """
        # Defaults are created per instance: a default built in the signature
        # would be one shared object, and fitting it through one optimizer
        # would silently change every other optimizer relying on the default.
        self.estimator = (lgb.LGBMClassifier(verbose=-1)
                          if estimator is None else estimator)
        self.cv = (StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                   if cv is None else cv)
        self.task = task
        self.optimizer = optimizer
        self.scorer = scorer

    def optimize(self, X, y):
        """Run the outer CV loop for the configured task.

        After a ``'feature_selection'`` run the following attributes are set:
        ``loop_log`` (per-fold dict with ``test_score``, ``inner_cv_score``
        and ``best_features``), ``best_set`` (value returned by
        ``MLUtils.get_best_fold``), ``score_best`` (highest outer test score)
        and ``score_std`` (standard deviation of the outer test scores).

        Args:
            X (df): the features; rows are selected with ``.iloc`` and
                columns with the selected feature labels.
            y (array): the target; a numpy array or a pandas Series.

        Returns:
            NestedCVOptimizer: ``self``; read the results from the attributes
            listed above. For any other ``task`` nothing is computed and
            ``self`` is returned unchanged.

        Raises:
            ValueError: if the task is ``'feature_selection'`` and
                ``optimizer`` or ``scorer`` was not provided.
        """
        if self.task != 'feature_selection':
            # TODO: add the hyperparameter-tuning task; more code can be
            # shared between FeatureSelector and HypermsTuner.
            return self

        if self.optimizer is None or self.scorer is None:
            raise ValueError(
                "feature_selection requires both 'optimizer' and 'scorer'")

        # Fold indices are positional, so a pandas target must go through
        # .iloc; plain y[idx] on a Series is a label lookup and breaks when
        # the index is not 0..n-1.
        y_is_pandas = hasattr(y, 'iloc')

        outer_loop_log = {}
        test_scores = []

        # Use the splitter configured on this instance, not a global `cv`.
        for i, (train_idx, test_idx) in enumerate(self.cv.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            if y_is_pandas:
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            else:
                y_train, y_test = y[train_idx], y[test_idx]

            # Perform feature selection using the inner cross-validation
            selected_features, inner_score = self.optimizer.select_features(
                X_train, y_train)
            # Train the estimator on the selected features
            self.estimator.fit(X_train[selected_features], y_train)
            # Evaluate the estimator on the held-out outer fold
            score = self.scorer(
                self.estimator, X_test[selected_features], y_test)

            outer_loop_log[f'fold_{i}'] = {
                'test_score': score,
                'inner_cv_score': inner_score,
                'best_features': selected_features,
            }
            test_scores.append(score)

        # NOTE(review): presumably returns the 'best_features' entry of the
        # fold with the best 'test_score' -- confirm against MLUtils.
        best_feature_set = MLUtils.get_best_fold(
            outer_loop_log, 'test_score', 'best_features')
        self.loop_log = outer_loop_log
        self.best_set = best_feature_set
        self.score_best = max(test_scores)
        self.score_std = np.std(test_scores)

        return self

In [5]:
# Example usage
# Build a demonstration dataset from scikit-learn's breast-cancer data
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Define the estimator
estimator = lgb.LGBMClassifier(verbose=-1)
# Define the cross-validation strategy; the same splitter object is handed to
# both the inner FeatureSelector and the outer NestedCVOptimizer
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Define the scorer
# NOTE(review): `needs_proba` is deprecated in recent scikit-learn releases in
# favour of `response_method` -- confirm against the installed version.
custom_scorer = make_scorer(MLUtils.gini_scorer, greater_is_better=True, needs_proba=True)
# Initialize the FeatureSelector with an estimator, scoring and cv option
feature_selector = FeatureSelector(estimator=estimator,
                                   min_features_to_select=2,
                                   step=1,
                                   scorer=custom_scorer,
                                   cv=cv)

# Perform nested cross-validation with above configuration
custom_optimizer = NestedCVOptimizer(estimator=estimator,
                                     cv=cv,
                                     task='feature_selection',
                                     optimizer=feature_selector,
                                     scorer=custom_scorer)

# optimize() returns the optimizer itself, so the winning feature set has to
# be read from its `best_set` attribute rather than from the return value.
custom_optimizer.optimize(X, y)
best_features = custom_optimizer.best_set
print(best_features)

The best score is 0.9980347199475925, and the std is 0.013116190392169654
['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension']
