In [19]:
import pandas as pd
import numpy as np
from numpy import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error
import time
from collections import defaultdict

class NestedCV:
    """Nested cross-validation for a scikit-learn regression ``Pipeline``.

    For every outer fold a ``GridSearchCV`` (the inner loop) tunes the
    pipeline on the outer-training split; the refitted best estimator is then
    scored on the held-out outer-test split with MAE and RMSE.

    Attributes set by ``fit``:
        best_hp_list: best hyperparameter dict of each outer fold, in order.
        nested_cv_scores: mapping ``'MAE'`` / ``'RMSE'`` -> per-fold scores.
        mean_outer_cv_scores: mapping ``'MAE'`` / ``'RMSE'`` -> mean over folds.
        best_params: best hyperparameters of the last outer fold.
        model_name: class name of the pipeline's final step.
        pipe, params: the pipeline and grid passed to ``fit``.
    """

    # Keys of the scorers handed to GridSearchCV; ``refit`` must be one of them.
    _METRIC_KEYS = ('MAE', 'MSE')

    def __init__(self, innercv: int = 10, outercv: int = 10, refit: str = 'MSE', random_state: int = 42):
        """Store the configuration.

        Args:
            innercv: number of folds of the inner (tuning) K-fold.
            outercv: number of folds of the outer (evaluation) K-fold.
            refit: scorer key (``'MAE'`` or ``'MSE'``) used to select and refit
                the best estimator in each grid search.
            random_state: seed passed to both shuffled K-fold splitters.
        """
        self.innercv = innercv
        self.outercv = outercv
        self.refit = refit # Refit the grid
        self.random_state = random_state
        self.best_hp_list = []


    def __repr__(self):
        return f"NestedCV(inner loops: {self.innercv}, outer loops: {self.outercv})"

    def fit(self, X: pd.DataFrame, y: pd.DataFrame, pipeline: "Pipeline",
            grid_param: dict, trace: bool = True, njobs: bool = False):
        """Run the nested cross-validation and store the results on ``self``.

        Args:
            X: feature table; converted with ``np.array``.
            y: target values; flattened with ``np.ravel``.
            pipeline: scikit-learn ``Pipeline`` to tune; its last step gives
                ``model_name``.
            grid_param: parameter grid passed to ``GridSearchCV``.
            trace: when true, print per-fold scores, best hyperparameters,
                progress and elapsed time.
            njobs: when true the grid search runs with ``n_jobs=-1``,
                otherwise with ``n_jobs=None``.

        Raises:
            ValueError: if ``self.refit`` is not one of the scorer keys.
        """
        # Fail early with a clear message rather than deep inside GridSearchCV.
        if self.refit not in self._METRIC_KEYS:
            raise ValueError(
                f"refit must be one of {self._METRIC_KEYS}, got {self.refit!r}"
            )

        X = np.array(X)
        y = np.ravel(y)

        # Error metrics are negated (greater_is_better=False) so that the grid
        # search can maximise them.
        scoring_metrics = {
            'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
            'MSE': make_scorer(mean_squared_error, greater_is_better=False)
        }

        # Start every run from a clean state: without this a second call to
        # fit() would append to the previous run's hyperparameters.
        self.best_hp_list = []
        self.nested_cv_scores = defaultdict(list)

        inner_cv = KFold(n_splits=self.innercv, shuffle=True, random_state=self.random_state)
        outer_cv = KFold(n_splits=self.outercv, shuffle=True, random_state=self.random_state)

        # Identical for every outer fold, so build it once.
        grid_search_params = {'cv': inner_cv,
                              'scoring': scoring_metrics,
                              'refit': self.refit,
                              'return_train_score': True,
                              'n_jobs': -1 if njobs else None}

        for i, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            outer_start_time = time.perf_counter()
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            inner_cv_search = GridSearchCV(estimator=pipeline,
                                           param_grid=grid_param,
                                           **grid_search_params)
            inner_cv_search.fit(X_train, y_train)
            self.best_params = inner_cv_search.best_params_
            self.best_hp_list.append(self.best_params)

            # predict() uses the estimator refitted on the whole outer-training split.
            y_pred = inner_cv_search.predict(X_test)

            # MSE is reported as RMSE so both scores share the target's units.
            fold_scores = {
                'MAE': mean_absolute_error(y_test, y_pred),
                'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
            }
            for metric_name, score in fold_scores.items():
                self.nested_cv_scores[metric_name].append(score)

                if trace:
                    print(f"Outer fold {i+1} {metric_name}: {score:.3f}")

            if trace:
                print(f"Best hyperparameters: {self.best_params}")
                percentage_done = (i + 1) / outer_cv.n_splits * 100
                print(f"{percentage_done:.2f}% of the procedure is complete\n")

            outer_time_elapsed = time.perf_counter() - outer_start_time
            if trace:
                mins, secs = divmod(outer_time_elapsed, 60)
                outer_time = f"{int(mins)} min and {secs:.2f} sec"
                print(f"Time taken for outer-fold-{i+1}: {outer_time}\n")

        self.mean_outer_cv_scores = {metric: np.mean(scores) for metric, scores in self.nested_cv_scores.items()}

        # The final pipeline step is the estimator; its class name labels the results.
        self.model_name = type(pipeline.steps[-1][1]).__name__
        self.pipe = pipeline
        self.params = grid_param

    def best_hp(self):
        """Return a DataFrame with one row of best hyperparameters per outer fold."""
        best_hp_df = pd.DataFrame(self.best_hp_list)
        # Label by the number of rows actually collected, not by self.outercv,
        # so the index always matches the data.
        best_hp_df.index = [f"Outer Fold {i+1}" for i in range(len(self.best_hp_list))]
        return best_hp_df

    def performance(self):
        """Return the mean outer-fold scores as a one-column DataFrame (metrics as rows)."""
        mean_scores_df = pd.DataFrame(self.mean_outer_cv_scores,
                                      index=[f'\'{self.model_name}\' NestedCV Performance']).T
        return mean_scores_df


In [20]:
# Load the dataset: the first column is used as the regression target,
# every remaining column as a feature.
data = pd.read_csv("interactions.csv")
X = data.iloc[:, 1:]
y = data.iloc[:, :1]

In [21]:
from sklearn.svm import SVR

# SVR pipeline: standardise the features, then fit a support-vector regressor.
# A feature-selection algorithm can also be added as a step here
# (it is then performed for each fold).
SVR_pipe = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('regressor', SVR(gamma='scale')),
])

# Hyperparameter grid explored by the inner cross-validation loop.
SVR_grid_param = dict(
    regressor__C=[1, 10, 50],
    regressor__kernel=['linear', 'rbf', 'poly'],
    regressor__epsilon=[1],
)

NestedCV_SVR = NestedCV(innercv=5, outercv=5, random_state=5666)
NestedCV_SVR.fit(
    X, y,
    pipeline=SVR_pipe,
    grid_param=SVR_grid_param,
    njobs=True,  # activate parallel computing for the grid search
)

Outer fold 1 MAE: 3.520
Outer fold 1 RMSE: 4.618
Best hyperparameters: {'regressor__C': 1, 'regressor__epsilon': 1, 'regressor__kernel': 'linear'}
20.00% of the procedure is complete

Time taken for outer-fold-1: 0 min and 9.20 sec

Outer fold 2 MAE: 3.518
Outer fold 2 RMSE: 4.841
Best hyperparameters: {'regressor__C': 50, 'regressor__epsilon': 1, 'regressor__kernel': 'linear'}
40.00% of the procedure is complete

Time taken for outer-fold-2: 0 min and 13.01 sec

Outer fold 3 MAE: 3.248
Outer fold 3 RMSE: 4.050
Best hyperparameters: {'regressor__C': 50, 'regressor__epsilon': 1, 'regressor__kernel': 'linear'}
60.00% of the procedure is complete

Time taken for outer-fold-3: 0 min and 15.87 sec

Outer fold 4 MAE: 3.898
Outer fold 4 RMSE: 5.201
Best hyperparameters: {'regressor__C': 1, 'regressor__epsilon': 1, 'regressor__kernel': 'linear'}
80.00% of the procedure is complete

Time taken for outer-fold-4: 0 min and 8.41 sec

Outer fold 5 MAE: 3.600
Outer fold 5 RMSE: 4.589
Best hyperparam

In [22]:
# Mean MAE and RMSE over the outer folds for the SVR pipeline.
NestedCV_SVR.performance()

Unnamed: 0,'SVR' NestedCV Performance
MAE,3.556709
RMSE,4.659937


In [10]:
# Best hyperparameters selected by the inner grid search in each outer fold (SVR).
NestedCV_SVR.best_hp()

Unnamed: 0,regressor__C,regressor__epsilon,regressor__kernel
Outer Fold 1,1,1,linear
Outer Fold 2,50,1,linear
Outer Fold 3,50,1,linear
Outer Fold 4,1,1,linear
Outer Fold 5,10,1,linear


In [29]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest pipeline: standardise the features, then fit the forest
# (fixed seed so the forest itself is reproducible).
RF_pipe = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(bootstrap=True, random_state=42)),
])

# Hyperparameter grid explored by the inner cross-validation loop.
RF_grid_param = dict(
    regressor__n_estimators=(100, 1000),
    regressor__max_features=('log2', 'sqrt'),
    regressor__criterion=('squared_error', 'absolute_error', 'friedman_mse'),
)

NestedCV_RF = NestedCV(innercv=5, outercv=5)

NestedCV_RF.fit(
    X, y,
    pipeline=RF_pipe,
    grid_param=RF_grid_param,
    njobs=True,  # run the grid search with n_jobs=-1
)

Outer fold 1 MAE: 4.007
Outer fold 1 RMSE: 4.960
Best hyperparameters: {'regressor__criterion': 'squared_error', 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 1000}
20.00% of the procedure is complete

Time taken for outer-fold-1: 0 min and 26.47 sec

Outer fold 2 MAE: 3.858
Outer fold 2 RMSE: 4.699
Best hyperparameters: {'regressor__criterion': 'squared_error', 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 1000}
40.00% of the procedure is complete

Time taken for outer-fold-2: 0 min and 31.09 sec

Outer fold 3 MAE: 4.131
Outer fold 3 RMSE: 5.395
Best hyperparameters: {'regressor__criterion': 'squared_error', 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 1000}
60.00% of the procedure is complete

Time taken for outer-fold-3: 0 min and 31.09 sec

Outer fold 4 MAE: 4.067
Outer fold 4 RMSE: 5.164
Best hyperparameters: {'regressor__criterion': 'friedman_mse', 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 1000}
80.00% of the pr

In [30]:
# Mean MAE and RMSE over the outer folds for the random-forest pipeline.
NestedCV_RF.performance()

Unnamed: 0,'RandomForestRegressor' NestedCV Performance
MAE,3.972274
RMSE,4.990225


In [31]:
# Best hyperparameters selected by the inner grid search in each outer fold (random forest).
NestedCV_RF.best_hp()

Unnamed: 0,regressor__criterion,regressor__max_features,regressor__n_estimators
Outer Fold 1,squared_error,sqrt,1000
Outer Fold 2,squared_error,sqrt,1000
Outer Fold 3,squared_error,sqrt,1000
Outer Fold 4,friedman_mse,sqrt,1000
Outer Fold 5,friedman_mse,sqrt,1000


----

# $\textit{This class works with any regressor from the}$ $\color{orange}{\texttt{scikit-learn}}$ $\textit{library, provided it is the last step of a}$ $\texttt{Pipeline}$

---