In [1]:
import pandas as pd
import numpy as np
import rpy2
import rpy2.robjects as robjects
from sklearn.model_selection import train_test_split
import warnings
import time
warnings.filterwarnings('ignore')

### Import some data

In [2]:
data = pd.read_csv("data.csv").drop(['id', 'Unnamed: 32'], axis='columns')
data['diagnosis'] = data['diagnosis'].map({'M':1, 'B':0}) 
data = data.rename(columns={'diagnosis': 'y'})

#Python ignores 'dots' (we rename them in order to use stepAICc from R)
data = data.rename(columns={'concave points_worst': 'concave.points_worst',
                            'concave points_se': 'concave.points_se',
                            'concave points_mean': 'concave.points_mean'})
X = data.loc[:, data.columns != 'y']
y = data[['y']]

In [3]:
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

# Activate the automatic conversion
# between pandas and R data frames
pandas2ri.activate()

# Convert the pandas DataFrame to an R data.frame
r_data_frame = pandas2ri.py2rpy(data)

# Check the R data.frame
#print(r_data_frame)

# If you want to use the R data.frame in R functions
ro.r.assign('R_DF', r_data_frame)

y,radius_mean,texture_mean,...,concave.points_worst,symmetry_worst,fractal_dimension_worst
...,...,...,...,...,...,...


In [4]:
ro.r('R_DF')

Unnamed: 0,y,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave.points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave.points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [5]:
### Get access to StepAICc from R

# StepAICc does stepwise feature selection using
# Corrected Akaike Information Criterion 
# (exact code for that can be found on my thesis chapter 3)
# We could use any feature selection algorithm (and then convert it
# into scikit-learn class type)

r = robjects.r
source = r['source']("get_stepAICc.R")

In [76]:
%%time
stepAICc_backward = list(r['stepAICc_coef'](ro.r('R_DF')))

CPU times: user 59.6 s, sys: 12.7 s, total: 1min 12s
Wall time: 9.65 s


In [6]:
%%time
stepAICc_forward = list(r['stepAICc_coef'](ro.r('R_DF'),
                                          direction='forward'))

CPU times: user 2.36 s, sys: 66.6 ms, total: 2.43 s
Wall time: 2.44 s


In [8]:
%%time
stepAICc_both = list(r['stepAICc_coef'](ro.r('R_DF'),
                                          direction='both'))

CPU times: user 27 s, sys: 5.58 s, total: 32.6 s
Wall time: 7.12 s


In [77]:
#Backward stepAICc
print(len(list(stepAICc_backward)))
stepAICc_backward

24


['radius_mean',
 'texture_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave.points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'perimeter_se',
 'area_se',
 'smoothness_se',
 'compactness_se',
 'concavity_se',
 'concave.points_se',
 'symmetry_se',
 'fractal_dimension_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'concavity_worst',
 'symmetry_worst',
 'fractal_dimension_worst']

In [10]:
#Forward stepAICc
print(len(stepAICc_forward))
list(stepAICc_forward)

10


['perimeter_worst',
 'smoothness_worst',
 'texture_worst',
 'radius_se',
 'symmetry_worst',
 'compactness_se',
 'concavity_mean',
 'texture_se',
 'area_se',
 'concave.points_worst']

In [11]:
#Backward stepAICc
print(len(list(stepAICc_both)))
stepAICc_both

11


['radius_mean',
 'perimeter_mean',
 'compactness_mean',
 'concave.points_mean',
 'area_se',
 'radius_worst',
 'concavity_worst',
 'texture_worst',
 'symmetry_worst',
 'symmetry_se',
 'smoothness_worst']

--------

1. Load the dataset and define the classifier, parameter grid, and cross-validators (inner and outer).

2. Split the data using the outer cross-validator into training and test sets for each fold.

3. For each fold in the outer cross-validator:\
$\phantom{......}\mathbf i$. Perform hyperparameter tuning using $\texttt{GridSearchCV}$ on the training set from the outer cross-validator. This step involves the inner cross-$\phantom{........}$validator for model selection.\
$\phantom{......}\mathbf {ii}$. Evaluate the best model found by the inner cross-validator on the test set from the outer cross-validator. The test set from the outer cross-$\phantom{.........}$validator is unseen by the inner cross-validator during hyperparameter tuning.
4. Calculate the average score of the nested cross-validation.

-----

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils import resample #Bootstrapping
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     cross_val_score, StratifiedKFold)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import (roc_auc_score, f1_score, precision_score,
                            recall_score, average_precision_score,
                            balanced_accuracy_score, matthews_corrcoef,
                            make_scorer, get_scorer)

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

pandas2ri.activate()

class StepAICc(BaseEstimator, TransformerMixin):
    
    def __init__(self, direction: str = "backward"):
        self.direction = direction

    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        data = pd.concat([X, y], axis=1)

        r_data_frame = pandas2ri.py2rpy(data)
        # If you want to use the R data.frame in R functions
        ro.r.assign('R_DF', r_data_frame)
        self.feature_indices_ = list(r['stepAICc_coef'](ro.r('R_DF'),
                                         direction=self.direction))
        return self

    def transform(self, X):
        self.featuresDF = X[self.feature_indices_]
        return self.featuresDF
    
    def get_feature_names_out(self):
        return self.featuresDF.columns.tolist()

'average_precision_score' needed a custom scorer with probabilities is that it specifically deals with the ranking of predictions by their probability scores.

The other metrics, such as F1, precision, and recall, work directly with the predicted class labels, so there's no need to handle probabilities explicitly for them. These metrics are calculated based on the confusion matrix derived from the true and predicted class labels. As a result, the default implementation of make_scorer works well for them without any additional customization.

In [9]:
        
class NestedCV:
    """
    This implementation of Nested Cross-Validation was
    developed by Ioannis Maris in 2023.
    It is compatible with Python 3.11. This class is capable of
    performing Nested CV
    for any Scikit-learn classifier, including Random Forest,
    Support Vector Classifier (SVC),
    Logistic Regression, and more. Note that to use it for
    regression purposes,
    you only need to change the evaluation metrics to options
    such as
    Mean Squared Error (MSE), Mean Absolute Error (MAE), or R-
    squared (R2). 
    """
    import sys
    import os
    import warnings
    if not sys.warnoptions:
        warnings.simplefilter("ignore")
        os.environ["PYTHONWARNINGS"] = "ignore"
    
    def __init__(self, innercv: int = 10, outercv: int = 10):
        self.innercv = innercv
        self.outercv = outercv
        
    def __repr__(self):
        return f"NestedCV(inner loops: {self.innercv}, outer loops: {self.outercv})"
    
    def fit(self,
            X: pd.DataFrame,
            y: pd.DataFrame,
            pipeline: Pipeline,
            grid_param: dict,
            trace: bool = True,
            njobs: bool = False):
        
        response = y.columns[0]; col = list(X.columns)
        arr2df = lambda X: pd.DataFrame(X, columns=col)
        arr2vec = lambda y: pd.DataFrame(y, columns=[response])
        X = np.array(X); y = np.ravel(y) #'revectorize' them again
        
        custom_average_precision_score = lambda y_true, y_pred, pos_label=None, needs_proba=True:\
                 average_precision_score(y_true, y_pred, pos_label=pos_label)

        average_precision_scorer = make_scorer(custom_average_precision_score,
                                               pos_label=1, needs_proba=True)
        
        scoring_metrics = {
            'roc_auc': 'roc_auc',
            'F1': make_scorer(f1_score),
            'F1_macro': 'f1_macro',
            'precision': make_scorer(precision_score),
            'recall': make_scorer(recall_score),
            'average_precision': average_precision_scorer,
            'balanced_accuracy': make_scorer(balanced_accuracy_score),
            'accuracy': 'accuracy',
            'matthews_corrcoef': make_scorer(matthews_corrcoef),
        }
        self.nested_cv_scores = []
        best_params_counts = defaultdict(int)

        inner_cv = StratifiedKFold(n_splits=self.innercv, shuffle=True,
                         random_state=5666)
        outer_cv = StratifiedKFold(n_splits=self.outercv, shuffle=True,
                         random_state=5666)

        n_splits = inner_cv.n_splits
        best_indices = []; self.results_dict = {}; self.best_hp_list = []
        
        outer_cv_scores = {metric: [] for metric in scoring_metrics.keys()}
            
        #Nested CV loop
        
        for i, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            # Start counting outer-loop time
            outer_start_time = time.time()
            
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            if not njobs:
                inner_cv_search = GridSearchCV(estimator=pipeline,
                                               param_grid=grid_param,
                                               cv=inner_cv,
                                               scoring=scoring_metrics,
                                               refit='roc_auc',
                                               return_train_score=True)
            else:
                inner_cv_search = GridSearchCV(estimator=pipeline,
                                               param_grid=grid_param,
                                               cv=inner_cv,
                                               scoring=scoring_metrics,
                                               refit='roc_auc',
                                               return_train_score=True,
                                               n_jobs=-1)
                
            percentage_done = (i+1) / outer_cv.n_splits * 100
            if trace:
                print(f"\rInner CV training & hyperparameter tuning on outer fold {i+1} ...")

            inner_cv_search.fit(arr2df(X_train), arr2vec(y_train))

            y_test_prob = inner_cv_search.predict_proba(arr2df(X_test))[:, 1]  
            outer_cv_score = roc_auc_score(y_test, y_test_prob)

            cv_results = inner_cv_search.cv_results_
            best_indices.append(cv_results["rank_test_roc_auc"].argmin())
            
            if trace:
                print('\n', 69*"_", '\n')
                print(f"-> Outer fold {i+1} results:\n")

            outer_fold_results = []

            for metric in scoring_metrics.keys():
                # the performance of the best model for each inner cross-validation fold
                test_scores = [cv_results[f"split{split}_test_{metric}"][
                       best_indices[-1]] for split in range(inner_cv.get_n_splits())]
                outer_fold_results.append(test_scores)
                if trace:
                    print(f"Inner {metric} scores: {np.array(test_scores).round(4)}")
                    print(f"mean-inner-{metric} : {np.array(test_scores).mean():.3f}\n\n")
            
            
            # Create a dataframe for the current outer fold
            outer_fold_df = pd.DataFrame(outer_fold_results,
                              columns=[f"Inner Fold {j+1}" for j in range(self.innercv)],
                              index=scoring_metrics.keys())

            metrics_mean_values = outer_fold_df.mean(axis=1)
            outer_fold_df['Mean value'] = metrics_mean_values
            self.results_dict[f"outer_fold{i + 1}"] = outer_fold_df
            
            if trace:
                print(f"\n-> Outer fold {i+1} mean roc_auc: {outer_cv_score:.3f}\n")

            self.nested_cv_scores.append(outer_cv_score)

            self.best_params = inner_cv_search.best_params_
            self.best_hp_list.append(self.best_params)
            best_params_hashable = tuple(sorted(self.best_params.items()))
            best_params_counts[best_params_hashable] += 1
            
            if trace:
                print(f"Inner Fold {i+1} best hyperparameters :\n\n{self.best_params}")
                print("\n\n", 16*'-',
                      f"{percentage_done:.2f}% of the procedure is complete",
                      16*'-','\n')
            
            current_outer_fold_scores = {}
            for metric in scoring_metrics.keys():
                # Get the corresponding scorer object or string
                scorer = scoring_metrics[metric]
                ## Check if the scorer has 'needs_proba' set to True
                if hasattr(scorer, '_kwargs') and scorer._kwargs.get('needs_proba', False):
                    y_test_score = y_test_prob
                    
                else:
                    y_test_pred = inner_cv_search.predict(arr2df(X_test))
                    y_test_score = y_test_pred
                    
                 # Convert metric string to its corresponding scorer object
                if isinstance(scorer, str):
                    scorer = get_scorer(scorer)
                    
                # Calculate the outer fold score using the scorer and the test data
                outer_fold_score = scorer(inner_cv_search, arr2df(X_test), arr2vec(y_test))
                current_outer_fold_scores[metric] = outer_fold_score

            for metric in scoring_metrics.keys():
                outer_cv_scores[metric].append(current_outer_fold_scores[metric])
                
            outer_end_time = time.time()
            outer_time_elapsed = outer_end_time - outer_start_time
            if trace:
                mins, secs = divmod(outer_time_elapsed, 60)
                outer_time = f"{int(mins)} min. and {secs:.2f}" if int(mins)!=0 else f"{secs:.2f}"
                print(f"Time taken for outer-fold-{i+1}: ",
                      outer_time, "sec.\n")
                
                
        mean_outer_cv_scores = {metric: np.mean(scores) for metric, scores in outer_cv_scores.items()}
        self.mean_outer_cv_scores = mean_outer_cv_scores
        
        #Save the name of the cls for later on...
        last_step_name = list(pipeline.named_steps.keys())[-1]
        self.classifier_name = pipeline.named_steps[last_step_name].__class__.__name__
        #save the pipeline, params for later on (fit final model)...
        self.pipe = pipeline
        self.params = grid_param
        
    def mean_roc_auc(self, Format: int = 6): #Nested CV mean roc auc
        # Calculate the average score of the nested cross-validation
        nested_cv_average_score = np.mean(self.nested_cv_scores)
        return round(nested_cv_average_score, Format)
    
    def inner_scores(self, outer_fold=1):
        return self.results_dict["outer_fold"+str(outer_fold)]
        
    def best_hp(self):
        # Create a dataframe for the best hyperparameters for each outer fold
        best_hp_df = pd.DataFrame(self.best_hp_list,
                                  columns=self.best_params.keys())
        best_hp_df.columns = pd.MultiIndex.from_tuples([('Best Hyperparameters',
                                                         col) for col in best_hp_df.columns])
        best_hp_df.index = [f"Outer Fold {i+1}" for i in range(self.outercv)]
        best_hp_df.columns = pd.MultiIndex.from_tuples([('Best Hyperparameters',
                                      col[1].split('__')[-1]) for col in best_hp_df.columns])
        return best_hp_df
    
    def performance(self):
        #Mean NestedCV performance for each metric score
        mean_scores_dict = self.mean_outer_cv_scores
        mean_scores_df = pd.DataFrame(mean_scores_dict,
                                      index=[f'\'{self.classifier_name}\' NestedCV Performance']).T
        return mean_scores_df
    
    def most_frequent_hp(self):
        best_params_counts = defaultdict(int)

        # Count the occurrences of each set of best hyperparameters
        for params in self.best_hp_list:
            best_params_hashable = tuple(sorted(params.items()))
            best_params_counts[best_params_hashable] += 1

        # Find the most frequent set of best hyperparameters
        most_frequent_best_params = max(best_params_counts, key=best_params_counts.get)
        most_frequent_best_params = dict(most_frequent_best_params)

        return most_frequent_best_params
    
    ## Fit the final model using ALL data. 'Best' is based on most frequent HP (on outer folds) ##
    def fit_best_model(self, X: pd.DataFrame, y: pd.DataFrame):
        most_frequent_best_params = self.most_frequent_hp()

        # Set the most frequent hyperparameters to the pipeline
        for param_name, param_value in most_frequent_best_params.items():
            self.pipe.set_params(**{param_name: param_value})
            
        self.pipe.fit(X, y)

        return self.pipe
    
    def total_models_fitted(self):
        # Calculate the number of combinations in the param_grid
        combinations = 1
        for param_values in self.params.values():
            combinations *= len(param_values)
        
        # Multiply the number of combinations by the inner CV and outer CV loops
        total_models = combinations * self.innercv * self.outercv

        # Add 1 for the final model fitted with the entire dataset
        total_models += 1

        return total_models

# $\bullet$ NestedCV: $\texttt{Random Forest}$

In [14]:
%%time

## demo  example of the class ##


RF_pipe = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(bootstrap=True,
                                          random_state=69))
    ])


RF_grid_param = {
    
    'classifier__n_estimators': (100, 150),
    'classifier__criterion': ('gini', 'entropy'),
    'classifier__max_features': ('log2', 'sqrt')
    }

NestedCV_RF = NestedCV(innercv=5, outercv=5)


NestedCV_RF.fit(X, y,
                RF_pipe,
                RF_grid_param,
                njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.985  0.9995 0.9858 0.9946 0.9877]
mean-inner-roc_auc : 0.991


Inner F1 scores: [0.9394 0.9538 0.9143 0.9275 0.9375]
mean-inner-F1 : 0.935


Inner F1_macro scores: [0.9525 0.9641 0.9304 0.9416 0.9518]
mean-inner-F1_macro : 0.948


Inner precision scores: [0.9688 1.     0.8889 0.9143 0.9677]
mean-inner-precision : 0.948


Inner recall scores: [0.9118 0.9118 0.9412 0.9412 0.9091]
mean-inner-recall : 0.923


Inner average_precision scores: [0.982  0.9992 0.9839 0.9916 0.9827]
mean-inner-average_precision : 0.988


Inner balanced_accuracy scores: [0.9471 0.9559 0.9355 0.9443 0.9459]
mean-inner-balanced_accuracy : 0.946


Inner accuracy scores: [0.956  0.967  0.9341 0.9451 0.956 ]
mean-inner-accuracy : 0.952


Inner matthews_corrcoef scores: [0.906  0.9307 0.8617 0.8835 0.9047]
mean-inner-matthews_corrcoef 

In [15]:
NestedCV_RF.performance()

Unnamed: 0,'RandomForestClassifier' NestedCV Performance
roc_auc,0.989649
F1,0.952143
F1_macro,0.96215
precision,0.961639
recall,0.943189
average_precision,0.987166
balanced_accuracy,0.960386
accuracy,0.964804
matthews_corrcoef,0.924641


In [16]:
NestedCV_RF.inner_scores(outer_fold=1)

Unnamed: 0,Inner Fold 1,Inner Fold 2,Inner Fold 3,Inner Fold 4,Inner Fold 5,Mean value
roc_auc,0.985036,0.999484,0.98581,0.994582,0.987722,0.990527
F1,0.939394,0.953846,0.914286,0.927536,0.9375,0.934512
F1_macro,0.952456,0.964103,0.930357,0.941644,0.951801,0.948072
precision,0.96875,1.0,0.888889,0.914286,0.967742,0.947933
recall,0.911765,0.911765,0.941176,0.941176,0.909091,0.922995
average_precision,0.982045,0.99916,0.98392,0.991636,0.982728,0.987898
balanced_accuracy,0.94711,0.955882,0.935501,0.944272,0.945925,0.945738
accuracy,0.956044,0.967033,0.934066,0.945055,0.956044,0.951648
matthews_corrcoef,0.905984,0.930686,0.861714,0.883544,0.904703,0.897326


In [17]:
NestedCV_RF.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,criterion,max_features,n_estimators
Outer Fold 1,entropy,sqrt,100
Outer Fold 2,entropy,sqrt,150
Outer Fold 3,entropy,sqrt,150
Outer Fold 4,entropy,sqrt,150
Outer Fold 5,entropy,log2,100


In [18]:
final_model = NestedCV_RF.fit_best_model(X, y)
#Ready to do predictions on future, unseen, data
final_model

### $\to$ Lets add more Hyperparameters:

In [293]:
%%time

### nestedcv-RF using ALL predictors ###

RF_pipe = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(bootstrap=True,
                                          random_state=69))
    ])


RF_grid_param = {
    'classifier__n_estimators': (100, 150, 250, 450),
    'classifier__criterion': ('gini', 'entropy'),
    'classifier__min_samples_split': (2, 3, 5),
    'classifier__min_samples_leaf': (1, 2, 4),
    'classifier__max_features': ('sqrt', 'log2')
    }

# Including additional hyperparameters results in a substantial increase
## in the time required to fit each combination. Thats why one could use
### 'RandomizedSearchCV' instead of GridSearchCV.

NestedCV_RF_full = NestedCV(innercv=10, outercv=10)


NestedCV_RF_full.fit(X, y,
                     RF_pipe,
                     RF_grid_param)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9793 0.9968 0.977  1.     0.9984 0.9901 1.     0.9984 0.9967 0.9786]
mean-inner-roc_auc : 0.992


Inner F1 scores: [0.8571 0.9744 0.973  0.9744 0.9744 0.8889 0.9444 0.973  0.9231 0.9189]
mean-inner-F1 : 0.940


Inner F1_macro scores: [0.8923 0.9795 0.9788 0.9792 0.9792 0.9141 0.9571 0.9788 0.9377 0.9364]
mean-inner-F1_macro : 0.953


Inner precision scores: [0.9375 0.95   1.     0.95   0.95   0.9412 1.     1.     0.9    0.9444]
mean-inner-precision : 0.957


Inner recall scores: [0.7895 1.     0.9474 1.     1.     0.8421 0.8947 0.9474 0.9474 0.8947]
mean-inner-recall : 0.926


Inner average_precision scores: [0.9682 0.9946 0.9777 1.     0.9974 0.9837 1.     0.9974 0.995  0.9735]
mean-inner-average_precision : 0.989


Inner balanced_accuracy scores: [0.8796 0.9848 0.9737 0.9844 0.9844 0.9054 0.9474 0.97

In [294]:
NestedCV_RF_full.mean_roc_auc()

0.990975

In [295]:
NestedCV_RF_full.performance()

Unnamed: 0,'RandomForestClassifier' NestedCV Performance
roc_auc,0.990975
F1,0.944181
F1_macro,0.956195
precision,0.961739
recall,0.928788
average_precision,0.988124
balanced_accuracy,0.953203
accuracy,0.959524
matthews_corrcoef,0.913697


In [301]:
NestedCV_RF_full.inner_scores(outer_fold=1).round(3)

Unnamed: 0,Inner Fold 1,Inner Fold 2,Inner Fold 3,Inner Fold 4,Inner Fold 5,Inner Fold 6,Inner Fold 7,Inner Fold 8,Inner Fold 9,Inner Fold 10,Mean value
roc_auc,0.979,0.997,0.977,1.0,0.998,0.99,1.0,0.998,0.997,0.979,0.992
F1,0.857,0.974,0.973,0.974,0.974,0.889,0.944,0.973,0.923,0.919,0.94
F1_macro,0.892,0.979,0.979,0.979,0.979,0.914,0.957,0.979,0.938,0.936,0.953
precision,0.938,0.95,1.0,0.95,0.95,0.941,1.0,1.0,0.9,0.944,0.957
recall,0.789,1.0,0.947,1.0,1.0,0.842,0.895,0.947,0.947,0.895,0.926
average_precision,0.968,0.995,0.978,1.0,0.997,0.984,1.0,0.997,0.995,0.974,0.989
balanced_accuracy,0.88,0.985,0.974,0.984,0.984,0.905,0.947,0.974,0.942,0.932,0.951
accuracy,0.904,0.981,0.98,0.98,0.98,0.922,0.961,0.98,0.941,0.941,0.957
matthews_corrcoef,0.792,0.96,0.958,0.959,0.959,0.832,0.918,0.958,0.876,0.874,0.909


In [297]:
NestedCV_RF_full.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,criterion,max_features,min_samples_leaf,min_samples_split,n_estimators
Outer Fold 1,entropy,log2,2,5,250
Outer Fold 2,entropy,log2,1,5,100
Outer Fold 3,entropy,log2,1,2,150
Outer Fold 4,entropy,log2,2,2,100
Outer Fold 5,entropy,sqrt,2,5,250
Outer Fold 6,entropy,log2,2,5,150
Outer Fold 7,entropy,log2,1,5,150
Outer Fold 8,entropy,sqrt,2,5,100
Outer Fold 9,entropy,log2,1,3,150
Outer Fold 10,entropy,log2,1,5,450


In [298]:
NestedCV_RF_full.total_models_fitted()

14401

# $\to\texttt{ StepAICc-'filter'}$ inside the cross-validation 

In [215]:
%%time

### Use StepAICc 'filter' inside the cross-validation ###

RF_stepAICc_pipe = Pipeline([
    ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(bootstrap=True,
                                          criterion='entropy',
                                          random_state=69))
    ])

RF_stepAICc_grid_param = {
    'StepAICc__direction': ('forward', 'backward', 'both'),
    'classifier__n_estimators': (100, 250),
    'classifier__min_samples_split': (2, 5),
    'classifier__min_samples_leaf': (1, 3),
    'classifier__max_features': ('sqrt', 'log2')
    }

NestedCV_RF_stepAICc = NestedCV(innercv=10, outercv=10)

NestedCV_RF_stepAICc.fit(X, y,
           RF_stepAICc_pipe,
           RF_stepAICc_grid_param)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9679 0.9907]
mean-inner-roc_auc : 0.979


Inner F1 scores: [0.8687 0.9245]
mean-inner-F1 : 0.897


Inner F1_macro scores: [0.8992 0.9398]
mean-inner-F1_macro : 0.920


Inner precision scores: [0.9348 0.9245]
mean-inner-precision : 0.930


Inner recall scores: [0.8113 0.9245]
mean-inner-recall : 0.868


Inner average_precision scores: [0.9576 0.9857]
mean-inner-average_precision : 0.972


Inner balanced_accuracy scores: [0.8888 0.9398]
mean-inner-balanced_accuracy : 0.914


Inner accuracy scores: [0.9085 0.9437]
mean-inner-accuracy : 0.926


Inner matthews_corrcoef scores: [0.8037 0.8796]
mean-inner-matthews_corrcoef : 0.842



-> Outer fold 1 mean roc_auc: 0.995

Inner Fold 1 best hyperparameters :

{'StepAICc__direction': 'both'}


 ---------------- 50.00% of the procedure is complete ----------------

In [302]:
NestedCV_RF_stepAICc.mean_roc_auc()

0.985732

In [305]:
NestedCV_RF_stepAICc.performance()

Unnamed: 0,'RandomForestClassifier' NestedCV Performance
roc_auc,0.985732
F1,0.940462
F1_macro,0.952806
precision,0.947151
recall,0.933962
average_precision,0.983182
balanced_accuracy,0.951563
accuracy,0.956035
matthews_corrcoef,0.905732


In [303]:
NestedCV_RF_stepAICc.inner_scores()

Unnamed: 0,Inner Fold 1,Inner Fold 2,Mean value
roc_auc,0.967882,0.990672,0.979277
F1,0.868687,0.924528,0.896608
F1_macro,0.899208,0.939792,0.9195
precision,0.934783,0.924528,0.929655
recall,0.811321,0.924528,0.867925
average_precision,0.957649,0.985715,0.971682
balanced_accuracy,0.888806,0.939792,0.914299
accuracy,0.908451,0.943662,0.926056
matthews_corrcoef,0.803678,0.879584,0.841631


In [304]:
NestedCV_RF_stepAICc.best_hp()

Unnamed: 0_level_0,Best Hyperparameters
Unnamed: 0_level_1,direction
Outer Fold 1,both
Outer Fold 2,forward


# $\bullet$ NestedCV: $\texttt{Penalized Logistic Regression}$

In [231]:
%%time

from sklearn.linear_model import LogisticRegression

    
LR_pipe = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', LogisticRegression(penalty='elasticnet',
                                      solver='saga',
                                      random_state=69,
                                      max_iter=200,
                                      n_jobs=-1)) 
    ])                          # n_jobs: parallel backend


LR_param_grid = {
    'classifier__C': [1/0.05, 1/0.01, 1, 1/1.5]+
                     (np.linspace(2, 50, 30)**-1).tolist(),
    'classifier__l1_ratio': np.linspace(0.1,1,10).round(1)
}


NestedCV_LR = NestedCV(innercv=10, outercv=10)


NestedCV_LR.fit(X, y,
               LR_pipe,
               LR_param_grid,
               njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9888 0.9984 0.9984 1.     1.     0.9951 1.     1.     1.     0.972 ]
mean-inner-roc_auc : 0.995


Inner F1 scores: [0.9444 0.95   0.973  0.973  0.95   0.9444 1.     1.     0.973  0.9444]
mean-inner-F1 : 0.965


Inner F1_macro scores: [0.9575 0.9594 0.9788 0.9788 0.9589 0.9571 1.     1.     0.9788 0.9571]
mean-inner-F1_macro : 0.973


Inner precision scores: [1.     0.9048 1.     1.     0.9048 1.     1.     1.     1.     1.    ]
mean-inner-precision : 0.981


Inner recall scores: [0.8947 1.     0.9474 0.9474 1.     0.8947 1.     1.     0.9474 0.8947]
mean-inner-recall : 0.953


Inner average_precision scores: [0.9858 0.9974 0.9974 1.     1.     0.9928 1.     1.     1.     0.969 ]
mean-inner-average_precision : 0.994


Inner balanced_accuracy scores: [0.9474 0.9697 0.9737 0.9737 0.9688 0.9474 1.     1.  

In [280]:
NestedCV_LR.inner_scores(outer_fold=10).round(4)

Unnamed: 0,Inner Fold 1,Inner Fold 2,Inner Fold 3,Inner Fold 4,Inner Fold 5,Inner Fold 6,Inner Fold 7,Inner Fold 8,Inner Fold 9,Inner Fold 10,Mean value
roc_auc,1.0,1.0,1.0,0.9918,0.9984,1.0,0.9984,0.9918,1.0,0.9605,0.9941
F1,1.0,0.9744,0.973,0.973,0.973,0.9744,0.9143,0.9474,0.9744,0.9444,0.9648
F1_macro,1.0,0.9795,0.979,0.9788,0.9788,0.9792,0.9348,0.9581,0.9792,0.9571,0.9724
precision,1.0,0.95,1.0,1.0,1.0,0.95,1.0,0.9474,0.95,1.0,0.9797
recall,1.0,1.0,0.9474,0.9474,0.9474,1.0,0.8421,0.9474,1.0,0.8947,0.9526
average_precision,1.0,1.0,1.0,0.989,0.9974,1.0,0.9974,0.9881,1.0,0.9604,0.9932
balanced_accuracy,1.0,0.9848,0.9737,0.9737,0.9737,0.9844,0.9211,0.9581,0.9844,0.9474,0.9701
accuracy,1.0,0.9808,0.9808,0.9804,0.9804,0.9804,0.9412,0.9608,0.9804,0.9608,0.9746
matthews_corrcoef,1.0,0.9598,0.9589,0.9585,0.9585,0.9593,0.8775,0.9161,0.9593,0.9177,0.9466


In [238]:
NestedCV_LR.mean_roc_auc().round(3)

0.994

In [284]:
NestedCV_LR.performance()

Unnamed: 0,'LogisticRegression' NestedCV Performance
roc_auc,0.994444
F1,0.968138
F1_macro,0.97518
precision,0.985627
recall,0.952381
average_precision,0.99343
balanced_accuracy,0.971984
accuracy,0.977193
matthews_corrcoef,0.95141


In [292]:
NestedCV_LR.fit_best_model(X,y)

In [288]:
NestedCV_LR.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,C,l1_ratio
Outer Fold 1,1.0,0.1
Outer Fold 2,1.0,0.1
Outer Fold 3,1.0,0.2
Outer Fold 4,0.143564,0.1
Outer Fold 5,0.5,0.9
Outer Fold 6,0.5,0.7
Outer Fold 7,0.143564,0.1
Outer Fold 8,0.273585,0.1
Outer Fold 9,0.097315,0.2
Outer Fold 10,1.0,0.1


In [252]:
%%time
    
LR_pipe_step = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', LogisticRegression(penalty='elasticnet',
                                      solver='saga',
                                      random_state=69,
                                      max_iter=200,
                                      n_jobs=-1))
    ])


LR_param_grid_step = {
    'classifier__C': [1/0.05, 1/0.01, 1, 1/1.5]+
                     (np.linspace(2, 50, 30)**-1).tolist(),
    'classifier__l1_ratio': np.linspace(0.1,1,10).round(1)
}


NestedCV_LR_step = NestedCV(innercv=10, outercv=10)


NestedCV_LR_step.fit(data[stepAICc_backward], y,
               LR_pipe,
               LR_param_grid,
               njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.992  0.9984 0.9984 0.9951 1.     0.9967 1.     1.     1.     0.9753]
mean-inner-roc_auc : 0.996


Inner F1 scores: [0.9444 0.95   0.973  0.973  0.95   0.9444 1.     0.973  1.     0.9444]
mean-inner-F1 : 0.965


Inner F1_macro scores: [0.9575 0.9594 0.9788 0.9788 0.9589 0.9571 1.     0.9788 1.     0.9571]
mean-inner-F1_macro : 0.973


Inner precision scores: [1.     0.9048 1.     1.     0.9048 1.     1.     1.     1.     1.    ]
mean-inner-precision : 0.981


Inner recall scores: [0.8947 1.     0.9474 0.9474 1.     0.8947 1.     0.9474 1.     0.8947]
mean-inner-recall : 0.953


Inner average_precision scores: [0.989  0.9974 0.9974 0.9928 1.     0.995  1.     1.     1.     0.9721]
mean-inner-average_precision : 0.994


Inner balanced_accuracy scores: [0.9474 0.9697 0.9737 0.9737 0.9688 0.9474 1.     0.97

In [257]:
NestedCV_LR_step.performance().round(3)

Unnamed: 0,'LogisticRegression' NestedCV Performance
roc_auc,0.995
F1,0.966
F1_macro,0.973
precision,0.981
recall,0.952
average_precision,0.994
balanced_accuracy,0.971
accuracy,0.975
matthews_corrcoef,0.948


In [289]:
NestedCV_LR_step.total_models_fitted()

34001

In [259]:
NestedCV_LR_step.best_hp().round(4)

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,C,l1_ratio
Outer Fold 1,20.0,0.1
Outer Fold 2,20.0,0.6
Outer Fold 3,0.6667,0.1
Outer Fold 4,0.6667,0.2
Outer Fold 5,1.0,0.3
Outer Fold 6,0.6667,0.1
Outer Fold 7,20.0,0.1
Outer Fold 8,0.6667,0.1
Outer Fold 9,0.2736,0.1
Outer Fold 10,1.0,0.1


# $\bullet$ NestedCV: $\texttt{SVC}$


In [80]:
%%time

from sklearn.svm import SVC


from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning



SVC_stepAICc_pipe = Pipeline([
   # ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', SVC(random_state=69,
                       probability=True))
    ])


SVC_param_grid = {
    'classifier__C': [0.05, 1, 2, 3, 4, 5],
    'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'classifier__degree': [1, 2, 3],
    'classifier__gamma': ['scale', 'auto'] + [0.1, 1, 10],
    }


NestedCV_SVC = NestedCV(innercv=10, outercv=10)


NestedCV_SVC.fit(X, y,
           SVC_stepAICc_pipe,
           SVC_param_grid,
           njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9952 1.     0.9967 1.     1.     1.     1.     0.9934 1.     0.9819]
mean-inner-roc_auc : 0.997


Inner F1 scores: [0.9444 0.9268 0.9444 1.     0.973  0.9744 1.     0.973  1.     0.9444]
mean-inner-F1 : 0.968


Inner F1_macro scores: [0.9575 0.9396 0.9571 1.     0.9788 0.9792 1.     0.9788 1.     0.9571]
mean-inner-F1_macro : 0.975


Inner precision scores: [1.     0.8636 1.     1.     1.     0.95   1.     1.     1.     1.    ]
mean-inner-precision : 0.981


Inner recall scores: [0.8947 1.     0.8947 1.     0.9474 1.     1.     0.9474 1.     0.8947]
mean-inner-recall : 0.958


Inner average_precision scores: [0.9928 1.     0.9946 1.     1.     1.     1.     0.9908 1.     0.9778]
mean-inner-average_precision : 0.996


Inner balanced_accuracy scores: [0.9474 0.9545 0.9474 1.     0.9737 0.9844 1.     0.97

In [81]:
NestedCV_SVC.mean_roc_auc() #0.99444

0.994444

In [82]:
NestedCV_SVC.performance()

Unnamed: 0,'SVC' NestedCV Performance
roc_auc,0.994444
F1,0.970699
F1_macro,0.977127
precision,0.985627
recall,0.957143
average_precision,0.993359
balanced_accuracy,0.974365
accuracy,0.978947
matthews_corrcoef,0.95509


In [83]:
NestedCV_SVC.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,C,degree,gamma,kernel
Outer Fold 1,3,1,scale,rbf
Outer Fold 2,3,1,scale,rbf
Outer Fold 3,5,1,scale,rbf
Outer Fold 4,4,1,scale,rbf
Outer Fold 5,2,1,scale,rbf
Outer Fold 6,4,1,scale,rbf
Outer Fold 7,2,1,scale,poly
Outer Fold 8,2,1,scale,rbf
Outer Fold 9,2,1,scale,rbf
Outer Fold 10,2,1,scale,rbf


# $\bullet$ K-NeighborsClassifier

In [310]:
from sklearn.neighbors import KNeighborsClassifier as knn


knn_pipe = Pipeline([
   # ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', knn())
    ])


param_grid_knn = {
    'classifier__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15], 
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
    'classifier__p': [1, 2, 3] # Only used when metric is 'minkowski'
}

NestedCV_knn = NestedCV(innercv=10, outercv=10)


NestedCV_knn.fit(X, y,
           knn_pipe,
           param_grid_knn,
           njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9841 0.9952 0.9934 1.     0.9885 0.9852 1.     0.9951 1.     0.9704]
mean-inner-roc_auc : 0.991


Inner F1 scores: [0.8485 0.9474 0.9444 0.9744 0.973  0.8889 1.     0.973  1.     0.9444]
mean-inner-F1 : 0.949


Inner F1_macro scores: [0.889  0.9585 0.9571 0.9792 0.9788 0.9141 1.     0.9788 1.     0.9571]
mean-inner-F1_macro : 0.961


Inner precision scores: [1.     0.9474 1.     0.95   1.     0.9412 1.     1.     1.     1.    ]
mean-inner-precision : 0.984


Inner recall scores: [0.7368 0.9474 0.8947 1.     0.9474 0.8421 1.     0.9474 1.     0.8947]
mean-inner-recall : 0.921


Inner average_precision scores: [0.9791 0.9928 0.9908 1.     0.9858 0.9773 1.     0.9928 1.     0.967 ]
mean-inner-average_precision : 0.989


Inner balanced_accuracy scores: [0.8684 0.9585 0.9474 0.9844 0.9737 0.9054 1.     0.97

In [318]:
NestedCV_knn.performance()

Unnamed: 0,'KNeighborsClassifier' NestedCV Performance
roc_auc,0.991601
F1,0.95717
F1_macro,0.967124
precision,0.994444
recall,0.92381
average_precision,0.988595
balanced_accuracy,0.960516
accuracy,0.970144
matthews_corrcoef,0.936571


In [316]:
NestedCV_knn.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,metric,n_neighbors,p,weights
Outer Fold 1,euclidean,9,1,distance
Outer Fold 2,euclidean,15,1,distance
Outer Fold 3,euclidean,15,1,distance
Outer Fold 4,euclidean,13,1,distance
Outer Fold 5,euclidean,15,1,distance
Outer Fold 6,euclidean,9,1,distance
Outer Fold 7,euclidean,15,1,distance
Outer Fold 8,euclidean,9,1,distance
Outer Fold 9,euclidean,13,1,distance
Outer Fold 10,euclidean,9,1,distance


In [315]:
NestedCV_knn.inner_scores(outer_fold=10).round(3)

Unnamed: 0,Inner Fold 1,Inner Fold 2,Inner Fold 3,Inner Fold 4,Inner Fold 5,Inner Fold 6,Inner Fold 7,Inner Fold 8,Inner Fold 9,Inner Fold 10,Mean value
roc_auc,1.0,1.0,0.994,0.997,0.993,1.0,0.993,0.992,1.0,0.963,0.993
F1,1.0,1.0,0.944,0.919,0.944,1.0,0.944,0.914,1.0,0.919,0.959
F1_macro,1.0,1.0,0.958,0.936,0.957,1.0,0.957,0.935,1.0,0.936,0.968
precision,1.0,1.0,1.0,0.944,1.0,1.0,1.0,1.0,1.0,0.944,0.989
recall,1.0,1.0,0.895,0.895,0.895,1.0,0.895,0.842,1.0,0.895,0.932
average_precision,1.0,1.0,0.99,0.995,0.99,1.0,0.99,0.988,1.0,0.961,0.991
balanced_accuracy,1.0,1.0,0.947,0.932,0.947,1.0,0.947,0.921,1.0,0.932,0.963
accuracy,1.0,1.0,0.962,0.941,0.961,1.0,0.961,0.941,1.0,0.941,0.971
matthews_corrcoef,1.0,1.0,0.918,0.874,0.918,1.0,0.918,0.877,1.0,0.874,0.938


# Decision Trees

In [322]:
from sklearn.tree import DecisionTreeClassifier

DT_pipe = Pipeline([
   # ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier())
    ])


param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 3, 5, 10],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 5, 10],
    'classifier__max_features': [None, 'sqrt', 'log2'],
}


NestedCV_DT = NestedCV(innercv=10, outercv=10)


NestedCV_DT.fit(X, y,
           DT_pipe,
           param_grid,
           njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9394 0.9745 0.9581 1.     0.991  0.9901 0.9622 0.9433 1.     0.9597]
mean-inner-roc_auc : 0.972


Inner F1 scores: [0.8235 0.8947 0.8421 0.9143 0.9474 0.8649 0.8947 0.8649 1.     0.8889]
mean-inner-F1 : 0.894


Inner F1_macro scores: [0.8689 0.9171 0.8742 0.9348 0.9581 0.894  0.9161 0.894  1.     0.9141]
mean-inner-F1_macro : 0.917


Inner precision scores: [0.9333 0.8947 0.8421 1.     0.9474 0.8889 0.8947 0.8889 1.     0.9412]
mean-inner-precision : 0.923


Inner recall scores: [0.7368 0.8947 0.8421 0.8421 0.9474 0.8421 0.8947 0.8421 1.     0.8421]
mean-inner-recall : 0.868


Inner average_precision scores: [0.9063 0.9428 0.9495 1.     0.9833 0.9845 0.9506 0.8845 1.     0.9464]
mean-inner-average_precision : 0.955


Inner balanced_accuracy scores: [0.8533 0.9171 0.8742 0.9211 0.9581 0.8898 0.9161 0.88

In [323]:
NestedCV_DT.performance()

Unnamed: 0,'DecisionTreeClassifier' NestedCV Performance
roc_auc,0.955596
F1,0.904919
F1_macro,0.925603
precision,0.928737
recall,0.88658
average_precision,0.931706
balanced_accuracy,0.922298
accuracy,0.931485
matthews_corrcoef,0.854422


In [324]:
NestedCV_DT.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,criterion,max_depth,max_features,min_samples_leaf,min_samples_split
Outer Fold 1,gini,10.0,log2,10,10
Outer Fold 2,gini,5.0,,10,10
Outer Fold 3,gini,5.0,log2,5,2
Outer Fold 4,gini,5.0,log2,10,10
Outer Fold 5,entropy,,sqrt,10,5
Outer Fold 6,gini,,log2,10,5
Outer Fold 7,gini,3.0,sqrt,10,10
Outer Fold 8,gini,10.0,log2,10,10
Outer Fold 9,entropy,3.0,sqrt,10,5
Outer Fold 10,gini,10.0,log2,10,2


In [325]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

NB_pipe = Pipeline([
   # ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', BernoulliNB())
    ])


param_grid = {
    'classifier__alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0],
    'classifier__binarize': [0.0, 0.1, 0.2, 0.5, 0.8, 1.0],
    'classifier__fit_prior': [True, False],
    'classifier__class_prior': [None, 'balanced']
}


NestedCV_NB = NestedCV(innercv=10, outercv=10)


NestedCV_NB.fit(X, y,
           NB_pipe,
           param_grid,
           njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9426 0.9936 0.9671 0.9786 0.9951 0.9901 0.9984 1.     0.9967 0.9457]
mean-inner-roc_auc : 0.981


Inner F1 scores: [0.7143 0.8837 0.9189 0.8947 0.9744 0.9143 0.9744 1.     0.95   0.8889]
mean-inner-F1 : 0.911


Inner F1_macro scores: [0.7604 0.9009 0.9364 0.9161 0.9792 0.9348 0.9792 1.     0.9589 0.9141]
mean-inner-F1_macro : 0.928


Inner precision scores: [0.6522 0.7917 0.9444 0.8947 0.95   1.     0.95   1.     0.9048 0.9412]
mean-inner-precision : 0.903


Inner recall scores: [0.7895 1.     0.8947 0.8947 1.     0.8421 1.     1.     1.     0.8421]
mean-inner-recall : 0.926


Inner average_precision scores: [0.9192 0.9908 0.9664 0.9593 0.9917 0.9853 0.9974 1.     0.995  0.949 ]
mean-inner-average_precision : 0.975


Inner balanced_accuracy scores: [0.7735 0.9242 0.9317 0.9161 0.9844 0.9211 0.9844 1.  

In [326]:
NestedCV_NB.performance()

Unnamed: 0,'BernoulliNB' NestedCV Performance
roc_auc,0.982037
F1,0.897521
F1_macro,0.917632
precision,0.893929
recall,0.905628
average_precision,0.975956
balanced_accuracy,0.919123
accuracy,0.922713
matthews_corrcoef,0.838207


In [327]:
NestedCV_NB.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,alpha,binarize,class_prior,fit_prior
Outer Fold 1,0.001,0.0,,True
Outer Fold 2,0.001,0.2,,True
Outer Fold 3,0.001,0.1,,True
Outer Fold 4,0.001,0.1,,True
Outer Fold 5,0.001,0.1,,True
Outer Fold 6,0.001,0.1,,True
Outer Fold 7,10.0,0.1,,True
Outer Fold 8,0.1,0.0,,True
Outer Fold 9,10.0,0.1,,True
Outer Fold 10,10.0,0.0,,True


# Gradient Boosting Machines (GBM)


# AdaBoost


GradientBoostingClassifier is an ensemble learning method in scikit-learn that builds an additive model in a forward stage-wise manner. It combines multiple weak learners, typically decision trees, to create a strong learner by sequentially training the trees to focus on the misclassified examples or residuals from the previous trees in the sequence.

Gradient Boosting works by fitting an initial model (typically a decision tree) to the data, then fitting additional models to the residuals of the initial model, updating the residuals after each iteration. By doing this, the algorithm learns to correct the errors made by previous models, thus improving the overall model performance. This process continues until a specified number of models have been added or the residual errors can no longer be reduced.

In [59]:
%%time

# Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

GB_pipe = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', GradientBoostingClassifier(
                                        random_state=5666))
    ])

param_grid = {
    'classifier__loss': ['deviance', 'exponential'],
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.001, 0.01],
    'classifier__max_depth': [5, 7],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2, 5],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__subsample': [0.5, 0.75, 1.0],
}

NestedCV_GB = NestedCV(innercv=10, outercv=10)


NestedCV_GB.fit(X, y, GB_pipe,
                param_grid, njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9745 0.9968 0.9786 0.9984 1.     0.9934 0.9951 0.9967 0.9984 0.9803]
mean-inner-roc_auc : 0.991


Inner F1 scores: [0.8571 0.9744 0.9143 0.9744 0.9744 0.8571 0.9444 0.9143 0.9474 0.9189]
mean-inner-F1 : 0.928


Inner F1_macro scores: [0.8923 0.9795 0.9348 0.9792 0.9792 0.8913 0.9571 0.9348 0.9581 0.9364]
mean-inner-F1_macro : 0.944


Inner precision scores: [0.9375 0.95   1.     0.95   0.95   0.9375 1.     1.     0.9474 0.9444]
mean-inner-precision : 0.962


Inner recall scores: [0.7895 1.     0.8421 1.     1.     0.7895 0.8947 0.8421 0.9474 0.8947]
mean-inner-recall : 0.900


Inner average_precision scores: [0.9595 0.9946 0.9786 0.9974 1.     0.9886 0.9922 0.9946 0.9974 0.9744]
mean-inner-average_precision : 0.988


Inner balanced_accuracy scores: [0.8796 0.9848 0.9211 0.9844 0.9844 0.8791 0.9474 0.92

In [60]:
NestedCV_GB.performance()

Unnamed: 0,'GradientBoostingClassifier' NestedCV Performance
roc_auc,0.989909
F1,0.655312
F1_macro,0.780649
precision,0.680375
recall,0.63355
average_precision,0.987826
balanced_accuracy,0.811219
accuracy,0.855482
matthews_corrcoef,0.632829


In [61]:
NestedCV_GB.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,learning_rate,loss,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,subsample
Outer Fold 1,0.01,deviance,7,log2,2,2,100,0.5
Outer Fold 2,0.001,exponential,7,sqrt,2,2,100,0.75
Outer Fold 3,0.01,exponential,7,log2,2,2,100,0.5
Outer Fold 4,0.001,deviance,5,log2,2,5,200,0.75
Outer Fold 5,0.01,exponential,5,sqrt,1,2,100,0.5
Outer Fold 6,0.01,deviance,7,log2,5,2,200,0.75
Outer Fold 7,0.01,deviance,7,sqrt,1,5,200,1.0
Outer Fold 8,0.01,deviance,7,log2,1,2,200,0.75
Outer Fold 9,0.01,exponential,5,log2,2,2,200,0.75
Outer Fold 10,0.001,deviance,7,log2,1,2,100,0.5


AdaBoostClassifier (Adaptive Boosting Classifier) is an ensemble learning method in scikit-learn that combines multiple weak learners, typically decision trees, to create a strong learner. The idea behind AdaBoost is to train weak classifiers sequentially and then combine their predictions using a weighted majority vote. The weights for each classifier are adjusted based on their performance, with more accurate classifiers getting higher weights.

The algorithm starts by training a weak classifier on the data and assigning equal weights to each training example. After training the initial classifier, the weights of the misclassified examples are increased. This makes the next classifier in the sequence focus more on the misclassified examples, attempting to correct the errors made by the previous classifier. This process continues for a specified number of iterations, and the final model is a weighted combination of all the weak classifiers.


In [37]:
%%time

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier


AB_pipe = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', AdaBoostClassifier(random_state=69))
    ])


param_grid = {
    'classifier__n_estimators': [800, 1000, 1300, 1500, 2000],
    'classifier__learning_rate': [0.005, 0.01],
    'classifier__algorithm': ['SAMME', 'SAMME.R'],
}


NestedCV_AB = NestedCV(innercv=10, outercv=10)


NestedCV_AB.fit(X, y, AB_pipe, param_grid,
                njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9745 0.9952 0.9951 1.     0.9951 0.9885 1.     0.9967 1.     0.9688]
mean-inner-roc_auc : 0.991


Inner F1 scores: [0.9143 0.95   0.9474 1.     0.973  0.9189 1.     0.9474 1.     0.9444]
mean-inner-F1 : 0.960


Inner F1_macro scores: [0.9354 0.9594 0.9581 1.     0.9788 0.9364 1.     0.9581 1.     0.9571]
mean-inner-F1_macro : 0.968


Inner precision scores: [1.     0.9048 0.9474 1.     1.     0.9444 1.     0.9474 1.     1.    ]
mean-inner-precision : 0.974


Inner recall scores: [0.8421 1.     0.9474 1.     0.9474 0.8947 1.     0.9474 1.     0.8947]
mean-inner-recall : 0.947


Inner average_precision scores: [0.9642 0.9922 0.9928 1.     0.9928 0.9815 1.     0.995  1.     0.9672]
mean-inner-average_precision : 0.989


Inner balanced_accuracy scores: [0.9211 0.9697 0.9581 1.     0.9737 0.9317 1.     0.95

In [74]:
pd.concat([NestedCV_AB.performance(),
           NestedCV_GB.performance()],
           axis=1).round(5)

Unnamed: 0,'AdaBoostClassifier' NestedCV Performance,'GradientBoostingClassifier' NestedCV Performance
roc_auc,0.98739,0.98991
F1,0.94827,0.65531
F1_macro,0.95975,0.78065
precision,0.97497,0.68038
recall,0.92403,0.63355
average_precision,0.98475,0.98783
balanced_accuracy,0.95503,0.81122
accuracy,0.96303,0.85548
matthews_corrcoef,0.92097,0.63283


In [57]:
NestedCV_AB.inner_scores(outer_fold=6).round(3)

Unnamed: 0,Inner Fold 1,Inner Fold 2,Inner Fold 3,Inner Fold 4,Inner Fold 5,Inner Fold 6,Inner Fold 7,Inner Fold 8,Inner Fold 9,Inner Fold 10,Mean value
roc_auc,0.952,1.0,0.967,0.977,1.0,0.995,1.0,1.0,1.0,0.98,0.987
F1,0.842,0.944,0.973,0.914,1.0,0.973,0.927,0.973,1.0,0.973,0.952
F1_macro,0.876,0.958,0.979,0.935,1.0,0.979,0.939,0.979,1.0,0.979,0.962
precision,0.889,1.0,1.0,1.0,1.0,1.0,0.864,1.0,1.0,1.0,0.975
recall,0.8,0.895,0.947,0.842,1.0,0.947,1.0,0.947,1.0,0.947,0.933
average_precision,0.933,1.0,0.973,0.976,1.0,0.993,1.0,1.0,1.0,0.98,0.985
balanced_accuracy,0.869,0.947,0.974,0.921,1.0,0.974,0.953,0.974,1.0,0.974,0.959
accuracy,0.885,0.962,0.98,0.941,1.0,0.98,0.941,0.98,1.0,0.98,0.965
matthews_corrcoef,0.754,0.918,0.958,0.877,1.0,0.958,0.885,0.958,1.0,0.958,0.927


In [44]:
NestedCV_AB.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,algorithm,learning_rate,n_estimators
Outer Fold 1,SAMME.R,0.01,2000
Outer Fold 2,SAMME.R,0.01,2000
Outer Fold 3,SAMME.R,0.01,2000
Outer Fold 4,SAMME.R,0.01,1500
Outer Fold 5,SAMME.R,0.01,2000
Outer Fold 6,SAMME.R,0.01,2000
Outer Fold 7,SAMME.R,0.01,2000
Outer Fold 8,SAMME.R,0.01,2000
Outer Fold 9,SAMME.R,0.01,2000
Outer Fold 10,SAMME.R,0.01,2000


In [347]:
# Quadratic Discriminant Analysis (QDA)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


QDA_pipe = Pipeline([
   # ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', QuadraticDiscriminantAnalysis())
    ])


param_grid = {
    'classifier__reg_param': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}



NestedCV_QDA = NestedCV(innercv=10, outercv=10)


NestedCV_QDA.fit(X, y,
           QDA_pipe,
           param_grid,
           njobs=True)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9872 0.9984 0.9951 0.9984 0.9967 0.9918 1.     0.9967 1.     0.9803]
mean-inner-roc_auc : 0.994


Inner F1 scores: [0.8824 0.9744 0.9444 0.9744 0.9444 0.9474 0.973  0.9189 1.     0.9444]
mean-inner-F1 : 0.950


Inner F1_macro scores: [0.9126 0.9795 0.9571 0.9792 0.9571 0.9581 0.9788 0.9364 1.     0.9571]
mean-inner-F1_macro : 0.962


Inner precision scores: [1.     0.95   1.     0.95   1.     0.9474 1.     0.9444 1.     1.    ]
mean-inner-precision : 0.979


Inner recall scores: [0.7895 1.     0.8947 1.     0.8947 0.9474 0.9474 0.8947 1.     0.8947]
mean-inner-recall : 0.926


Inner average_precision scores: [0.9821 0.9974 0.9928 0.9974 0.9946 0.9862 1.     0.9946 1.     0.9756]
mean-inner-average_precision : 0.992


Inner balanced_accuracy scores: [0.8947 0.9848 0.9474 0.9844 0.9474 0.9581 0.9737 0.93

In [348]:
NestedCV_QDA.performance()

Unnamed: 0,'QuadraticDiscriminantAnalysis' NestedCV Performance
roc_auc,0.991931
F1,0.955235
F1_macro,0.965417
precision,0.984737
recall,0.928571
average_precision,0.984895
balanced_accuracy,0.960119
accuracy,0.968421
matthews_corrcoef,0.932526


In [349]:
NestedCV_QDA.best_hp()

Unnamed: 0_level_0,Best Hyperparameters
Unnamed: 0_level_1,reg_param
Outer Fold 1,0.2
Outer Fold 2,0.4
Outer Fold 3,0.4
Outer Fold 4,0.2
Outer Fold 5,0.6
Outer Fold 6,0.7
Outer Fold 7,0.5
Outer Fold 8,0.3
Outer Fold 9,0.0
Outer Fold 10,0.6


In [359]:
%%time

# Multi-layer Perceptron (MLP)
from sklearn.neural_network import MLPClassifier

MLP_pipe = Pipeline([
   # ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', MLPClassifier())
    ])


param_grid = {
    'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'classifier__activation': ['logistic', 'tanh', 'relu'],
    'classifier__solver': ['sgd', 'adam'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    #'classifier__batch_size': ['auto', 32, 64, 128],
    #'classifier__learning_rate': ['constant', 'invscaling', 'adaptive'],
    #'classifier__learning_rate_init': [0.001, 0.01],
    #'classifier__momentum': [0.0, 0.2],
    #'classifier__validation_fraction': [0.1, 0.2],
}



NestedCV_MLP = NestedCV(innercv=10, outercv=10)


NestedCV_MLP.fit(X, y,
           MLP_pipe,
           param_grid,
           njobs)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9856 1.     0.9967 0.9984 1.     0.9951 1.     1.     1.     0.9786]
mean-inner-roc_auc : 0.995


Inner F1 scores: [0.973  0.9268 0.9444 0.973  1.     0.9474 1.     0.973  1.     0.9189]
mean-inner-F1 : 0.966


Inner F1_macro scores: [0.979  0.9396 0.9571 0.9788 1.     0.9581 1.     0.9788 1.     0.9364]
mean-inner-F1_macro : 0.973


Inner precision scores: [1.     0.8636 1.     1.     1.     0.9474 1.     1.     1.     0.9444]
mean-inner-precision : 0.976


Inner recall scores: [0.9474 1.     0.8947 0.9474 1.     0.9474 1.     0.9474 1.     0.8947]
mean-inner-recall : 0.958


Inner average_precision scores: [0.9831 1.     0.995  0.9974 1.     0.9928 1.     1.     1.     0.9735]
mean-inner-average_precision : 0.994


Inner balanced_accuracy scores: [0.9737 0.9545 0.9474 0.9737 1.     0.9581 1.     0.97

In [360]:
NestedCV_MLP.performance()

Unnamed: 0,'MLPClassifier' NestedCV Performance
roc_auc,0.993253
F1,0.958971
F1_macro,0.967743
precision,0.971385
recall,0.947835
average_precision,0.991439
balanced_accuracy,0.965545
accuracy,0.970144
matthews_corrcoef,0.936262


In [361]:
NestedCV_MLP.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,activation,alpha,hidden_layer_sizes,solver
Outer Fold 1,tanh,0.0001,"(50, 50)",adam
Outer Fold 2,tanh,0.01,"(50, 50)",adam
Outer Fold 3,tanh,0.0001,"(50, 50)",sgd
Outer Fold 4,tanh,0.0001,"(50, 50)",sgd
Outer Fold 5,tanh,0.0001,"(100, 100)",adam
Outer Fold 6,tanh,0.0001,"(50,)",adam
Outer Fold 7,relu,0.0001,"(50,)",adam
Outer Fold 8,tanh,0.0001,"(50,)",adam
Outer Fold 9,relu,0.0001,"(50, 50)",sgd
Outer Fold 10,tanh,0.001,"(50, 50)",sgd


## $\bullet $ Transform any R model ($\texttt{lm, glm, aov,}\dots$) $\to$ $\texttt{scikit-learn class type}$

In [44]:
r = robjects.r
source = r['source']("glmnet_models.R")

def pandas2R(df):
    pandas2ri.activate()

    # Convert the pandas DataFrame to an R data.frame
    r_data_frame = pandas2ri.py2rpy(df)

    # Check the R data.frame
    #print(r_data_frame)

    # If you want to use the R data.frame in R functions
    ro.r.assign('R_df', r_data_frame)
    
    return ro.r('R_df')

data_r = pandas2R(data)
#X_r = ro.r('R_DF')[ro.r('R_DF').columns[:-1]]
#y_r = ro.r('R_DF')[[ro.r('R_DF').columns[-1]]]

In [58]:
r['glm_model'](data=data_r,
               y='y',
               family='binomial',
               newdata=X)[1:10]

array([  2512140.93369268, -12194887.10861579,   1823303.49578046,
       -11858761.41782527,  -2289537.86386269,   1737984.24316869,
         1106211.87055824,  -1154034.98559833,  -9162976.193269  ])

In [59]:
glm_coef = r('''
        library(glmnet)

        glm_model <- function (data, y, family) {

        #Binary MLE regression
        if (family=='MLE') {

            model <- suppressWarnings({
                        glm(as.formula(paste(y, '~.')),
                            data=data,
                            family=binomial(link='logit'))
                        })

            return (model$coef)

        } else if (family=='gaussian') {

            model <- glm(as.formula(paste(y, '~.')),
                         data=data, family='gaussian')

            return (model$coef)
        } else if (family=='binomial') {

            model <- glm(as.formula(paste(y, '~.')),
                         data=data, family='binomial')

            return (model$coef)
        }   
    }
        ''')

glm_coef(data, 'y', 'binomial')

array([-2.88129530e+06,  2.42700530e+06,  1.95782547e+05,  1.47318370e+06,
       -1.30117798e+05, -1.52451790e+08, -6.42836868e+06,  1.04155049e+06,
       -1.71568125e+07,  4.04858144e+07, -4.23290619e+07,  3.32847253e+07,
        6.36837486e+06,  1.70071139e+06, -6.39344697e+05,  7.49170094e+08,
       -1.77307164e+08,  1.52864353e+08, -1.25985047e+09,  2.89010086e+08,
        1.51209933e+09, -6.13021470e+06, -5.83244138e+05, -3.53819425e+05,
        8.95042203e+04, -2.16112183e+07,  8.98631218e+06, -3.02791934e+07,
        1.43130036e+08, -2.47358298e+07, -3.69831407e+07])

In [60]:
glm = r('''
    library(glmnet)

    glm_model <- function (data, y, family, newdata) {

        #Binary MLE regression
        if (family=='MLE') {

            model <- suppressWarnings({
                        glm(as.formula(paste(y, '~.')),
                            data=data,
                            family=binomial(link='logit'))
                        })

            return (as.numeric(predict(model, newdata=newdata)))

        } else if (family=='gaussian') {

            model <- glm(as.formula(paste(y, '~.')),
                         data=data, family='gaussian')

            return (as.numeric(predict(model, newdata=newdata)))
        } else if (family=='binomial') {

            model <- glm(as.formula(paste(y, '~.')),
                         data=data, family='binomial')

            return (as.numeric(predict(model, newdata=newdata)))
        }   
    }
        ''')
glm(data=data, y='y', family='gaussian', newdata=data)[2:9]

array([1.13131986, 1.19228819, 0.8344466 , 0.69886256, 0.8497074 ,
       0.63199112, 0.69821005])

In [1113]:
"""import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class RandomClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, seed=None):
        self.seed = seed

    def fit(self, X, y):
        np.random.seed(self.seed)
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        np.random.seed(self.seed)
        n_samples = X.shape[0]
        y_pred = np.random.choice(self.classes_, size=n_samples)
        return y_pred

    def predict_proba(self, X):
        np.random.seed(self.seed)
        n_samples = X.shape[0]
        proba = np.random.rand(n_samples, len(self.classes_))
        return proba / proba.sum(axis=1, keepdims=True)
"""
print('example of how to turn a classifier to scikit-learn class type (cls example: Random-cls --^)')

example of how to turn a classifier to scikit-learn class type (cls example: Random-cls --^)


In [21]:
import numpy as np
import pandas as pd
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import (f1_score, roc_auc_score,
                             recall_score, precision_score,
                             balanced_accuracy_score)
pandas2ri.activate()
glmnet = importr("glmnet")

class GLM_Classifier(BaseEstimator, RegressorMixin):
    def __init__(self, family="gaussian"):
        self.family = family

    def fit(self, X, y):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(y, np.ndarray):
            y = pd.Series(y)

        data = X.copy()
        data["target"] = y

        self.formula_ = "target ~ ."
        self.family_ = self.family
        self.data_ = data
        self.classes_ = np.unique(y)

        return self


    def predict_proba(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)

        r_glm_model = r(f'''
                glm_model <- function (data, formula, family, newdata) {{
                    model <- glm(as.formula(formula),
                             data=data,
                             family=family)
                    return (as.numeric(predict(model, newdata=newdata)))
            }}
        ''')

        y_pred = np.array(
            r_glm_model(
                data=self.data_,
                formula=self.formula_,
                family=self.family_,
                newdata=X,
            )
        )
        
        #map it into 2 dim prob for class 1, prob for class 2
        y_pred = np.array(list(map(lambda x: [abs(1-x), x], y_pred)))
        
        return y_pred
    
    def predict(self, X):
        binary_pred = np.array(list(map(lambda x: 1 if x>=0.5 else 0,
                                       self.predict_proba(X)[:, 1])))
        return binary_pred
    
    def score(self, X, y, metric='roc_auc'):
        # Create a dictionary of available metrics
        metrics = {
            'f1': f1_score,
            'roc_auc': roc_auc_score,
            'recall': recall_score,
            'precision': precision_score,
            'balanced_accuracy': balanced_accuracy_score
        }

        # Check if the specified metric is valid
        if metric not in metrics:
            raise ValueError(f"Invalid metric '{metric}', choose from {list(metrics.keys())}")

        # Calculate and return the score using the specified metric
        y_pred = self.predict(X)
        if metric=='roc_auc':
            y_pred = self.predict_proba(X)[:, 1]
        score_func = metrics[metric]
        
        if self.family!='gaussian':
            return score_func(y, 1-y_pred)
        return score_func(y, y_pred)


In [22]:
m = GLM_Classifier(family='gaussian')
m.fit(np.array(X),np.ravel(y))

In [72]:
m.score(np.array(X),np.ravel(y))

0.9965250251043813

In [73]:
m.predict(np.array(X))[1:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1])

## NestedCV for glm models?

# glm~Gaussian nestedCV:

In [1123]:
%%time

glm_gaussian_pipe = Pipeline([
    ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', GLM_Classifier(family='gaussian'))
    ])


param_grid_glm = {
    #'classifier__family': ('gaussian', 'binomial'),
    'StepAICc__direction': ('both', 'forward', 'backward')
}


NestedCV_glm_gaussian = NestedCV(innercv=10, outercv=10)


NestedCV_glm_gaussian.fit(X, y,
           glm_gaussian_pipe,
           param_grid_glm)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.5638 0.6085 0.5896 0.6053 0.5847 0.5789 0.5263 0.6949 0.7738 0.6579]
mean-inner-roc_auc : 0.618


Inner F1 scores: [0.2609 0.4516 0.3333 0.3478 0.3704 0.2727 0.1    0.5714 0.7097 0.48  ]
mean-inner-F1 : 0.390


Inner F1_macro scores: [0.5255 0.6094 0.5641 0.579  0.5719 0.5364 0.4402 0.7046 0.7915 0.6556]
mean-inner-F1_macro : 0.598


Inner precision scores: [0.75   0.5833 0.8    1.     0.625  1.     1.     0.8889 0.9167 1.    ]
mean-inner-precision : 0.856


Inner recall scores: [0.1579 0.3684 0.2105 0.2105 0.2632 0.1579 0.0526 0.4211 0.5789 0.3158]
mean-inner-recall : 0.274


Inner average_precision scores: [0.6193 0.4992 0.61   0.8137 0.6538 0.528  0.5524 0.8587 0.9095 0.7599]
mean-inner-average_precision : 0.680


Inner balanced_accuracy scores: [0.5638 0.6085 0.5896 0.6053 0.5847 0.5789 0.5263 0.69

In [1125]:
NestedCV_glm_gaussian.best_hp()

Unnamed: 0_level_0,Best Hyperparameters
Unnamed: 0_level_1,direction
Outer Fold 1,both
Outer Fold 2,forward
Outer Fold 3,both
Outer Fold 4,backward
Outer Fold 5,both
Outer Fold 6,backward
Outer Fold 7,both
Outer Fold 8,backward
Outer Fold 9,backward
Outer Fold 10,forward


In [1126]:
NestedCV_glm_gaussian.inner_scores(outer_fold=10).round(2)

Unnamed: 0,Inner Fold 1,Inner Fold 2,Inner Fold 3,Inner Fold 4,Inner Fold 5,Inner Fold 6,Inner Fold 7,Inner Fold 8,Inner Fold 9,Inner Fold 10,Mean value
roc_auc,0.73,0.56,0.76,0.55,0.62,0.69,0.63,0.6,0.69,0.57,0.64
F1,0.65,0.26,0.69,0.25,0.4,0.57,0.42,0.38,0.57,0.32,0.45
F1_macro,0.75,0.53,0.78,0.51,0.6,0.7,0.62,0.59,0.7,0.55,0.63
precision,0.91,0.75,1.0,0.6,0.83,0.89,1.0,0.71,0.89,0.67,0.83
recall,0.5,0.16,0.53,0.16,0.26,0.42,0.26,0.26,0.42,0.21,0.32
average_precision,0.75,0.54,0.94,0.57,0.7,0.77,0.76,0.67,0.72,0.66,0.71
balanced_accuracy,0.73,0.56,0.76,0.55,0.62,0.69,0.63,0.6,0.69,0.57,0.64
accuracy,0.79,0.67,0.83,0.65,0.71,0.76,0.73,0.69,0.76,0.67,0.72
matthews_corrcoef,0.56,0.23,0.64,0.16,0.35,0.49,0.43,0.28,0.49,0.22,0.39


# Binary MLE regression NestedCV

In [74]:
%%time

glm_binomial_pipe = Pipeline([
    ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', GLM_Classifier(family='binomial'))
    ])


param_grid_glm = {
    #'classifier__family': ('gaussian', 'binomial'),
    'StepAICc__direction': ('both', 'forward', 'backward')
}

NestedCV_glm_binomial = NestedCV(innercv=10, outercv=10)


NestedCV_glm_binomial.fit(X, y,
           glm_binomial_pipe,
           param_grid_glm)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.5263 0.5112 0.537  0.5526 0.5107 0.4844 0.5    0.6053 0.6316 0.5526]
mean-inner-roc_auc : 0.541


Inner F1 scores: [0.1    0.0952 0.1818 0.1905 0.0952 0.     0.     0.3478 0.4167 0.1905]
mean-inner-F1 : 0.162


Inner F1_macro scores: [0.4429 0.4332 0.4784 0.4903 0.4303 0.378  0.3855 0.579  0.6186 0.4903]
mean-inner-F1_macro : 0.473


Inner precision scores: [1.     0.5    0.6667 1.     0.5    0.     0.     1.     1.     1.    ]
mean-inner-precision : 0.667


Inner recall scores: [0.0526 0.0526 0.1053 0.1053 0.0526 0.     0.     0.2105 0.2632 0.1053]
mean-inner-recall : 0.095


Inner average_precision scores: [0.5532 0.4655 0.4893 0.8039 0.6001 0.4266 0.4836 0.8303 0.6543 0.7096]
mean-inner-average_precision : 0.602


Inner balanced_accuracy scores: [0.5263 0.5112 0.537  0.5526 0.5107 0.4844 0.5    0.60

In [75]:
NestedCV_glm_binomial.performance()

Unnamed: 0,'GLM_Classifier' NestedCV Performance
roc_auc,0.942965
F1,0.926311
F1_macro,0.941521
precision,0.92179
recall,0.93355
average_precision,0.980689
balanced_accuracy,0.942965
accuracy,0.945551
matthews_corrcoef,0.884814


In [76]:
NestedCV_glm_binomial.best_hp()

Unnamed: 0_level_0,Best Hyperparameters
Unnamed: 0_level_1,direction
Outer Fold 1,backward
Outer Fold 2,forward
Outer Fold 3,backward
Outer Fold 4,backward
Outer Fold 5,backward
Outer Fold 6,both
Outer Fold 7,both
Outer Fold 8,backward
Outer Fold 9,backward
Outer Fold 10,backward


# NestedCV for $\texttt{cv.glmnet() ElasticNet}$

In [11]:
import numpy as np
import pandas as pd
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
from sklearn.base import BaseEstimator, ClassifierMixin

pandas2ri.activate()
glmnet = importr("glmnet")

class CVGlmnetClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, family="gaussian", alpha=0.5, random_state=False):
        self.family = family
        self.alpha = alpha
        self.random_state = random_state

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = np.array(X)
        if isinstance(y, pd.DataFrame):
            y = np.ravel(y)
            
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(y, np.ndarray):
            y = pd.Series(y)

        self.X_ = X
        self.y_ = y
        self.classes_ = np.unique(y)

        return self

    def predict_proba(self, X):
        np.random.seed(self.random_state)
        
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        ### R code for cv.glmnet() ###
        r_cv_glmnet = r(f'''
            cv_glmnet <- function (X, y, family,
                                  alpha, newx, random_state) {{
                                  
                if (family == "binomial") {{
                    set.seed(random_state)
                    model <- cv.glmnet(as.matrix(X),
                                       as.numeric(y),
                                       alpha=alpha,
                                       family="binomial",
                                       type.measure="auc",
                                       nfolds=10,
                                       parallel=TRUE)
                }} else if (family == "gaussian") {{
                    set.seed(random_state)
                    model <- cv.glmnet(as.matrix(X),
                                       as.numeric(y),
                                       alpha=alpha,
                                       family="gaussian",
                                       nfolds=10,
                                       parallel=TRUE)
                }}
                return (as.numeric(predict(model,
                                           newx=as.matrix(newx),
                                           type="response")))
            }}
        ''')

        y_pred = np.array(
            r_cv_glmnet(
                X=self.X_,
                y=self.y_,
                family=self.family,
                alpha=self.alpha,
                newx=X,
                random_state=self.random_state))
        
        y_pred = np.array(list(map(lambda x: [abs(1-x), x], y_pred)))
        
        return y_pred
    
    def predict(self, X):
        binary_pred = np.array(list(map(lambda x: 1 if x>=0.5 else 0,
                                       self.predict_proba(X)[:, 1])))
        return binary_pred

    
    def score(self, X, y, metric='roc_auc'):
        # Create a dictionary of available metrics
        metrics = {
            'f1': f1_score,
            'roc_auc': roc_auc_score,
            'recall': recall_score,
            'precision': precision_score,
            'balanced_accuracy': balanced_accuracy_score
        }

        # Check if the specified metric is valid
        if metric not in metrics:
            raise ValueError(f"Invalid metric '{metric}', choose from {list(metrics.keys())}")

        # Calculate and return the score using the specified metric
        y_pred = self.predict(X)
        if metric=='roc_auc':
            y_pred = self.predict_proba(X)[:, 1]
        score_func = metrics[metric]
        

        return score_func(y, y_pred)


In [12]:
cv_model = CVGlmnetClassifier(family='binomial', alpha=0.6, random_state=2)
cv_model.fit(X, y)

In [13]:
cv_model.predict_proba(X)

array([[7.58281029e-05, 9.99924172e-01],
       [7.86485354e-03, 9.92135146e-01],
       [5.67245279e-04, 9.99432755e-01],
       ...,
       [1.04585579e-01, 8.95414421e-01],
       [3.99685340e-06, 9.99996003e-01],
       [9.98764251e-01, 1.23574890e-03]])

In [14]:
cv_model.score(X, y)

0.9956001268431902

# NestedCV for $\texttt{cv.glmnet(family=gaussian)}$ (full model)

In [15]:
%%time

cvglmnet_binomial_pipe_full = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', CVGlmnetClassifier(family='binomial',
                                      random_state=5666))
    ])


param_grid_glmnet_full = {
     #We optimize 'lambda' on cv.glmnet, we only need to opt. alhpa
    'classifier__alpha': (0, 1)#tuple(np.linspace(0,1,11+1).round(2)),
}

NestedCV_cvglmnet_binomial_full = NestedCV(innercv=10, outercv=10)


NestedCV_cvglmnet_binomial_full.fit(X, y,
           cvglmnet_binomial_pipe_full,
           param_grid_glmnet_full)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9697 0.9984 0.9901 0.9984 0.9967 0.9819 1.     1.     1.     0.9655]
mean-inner-roc_auc : 0.990


Inner F1 scores: [0.8125 0.973  0.8824 0.973  0.9444 0.8125 0.9444 0.8824 1.     0.9444]
mean-inner-F1 : 0.917


Inner F1_macro scores: [0.8646 0.979  0.9118 0.9788 0.9571 0.8634 0.9571 0.9118 1.     0.9571]
mean-inner-F1_macro : 0.938


Inner precision scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
mean-inner-precision : 1.000


Inner recall scores: [0.6842 0.9474 0.7895 0.9474 0.8947 0.6842 0.8947 0.7895 1.     0.8947]
mean-inner-recall : 0.853


Inner average_precision scores: [0.9533 0.9974 0.9874 0.9974 0.9946 0.9724 1.     1.     1.     0.9656]
mean-inner-average_precision : 0.987


Inner balanced_accuracy scores: [0.8421 0.9737 0.8947 0.9737 0.9474 0.8421 0.9474 0.8947 1.     0.9474]
mean-inner-balanced_ac

In [86]:
NestedCV_cvglmnet_binomial_full.performance().round(4) #9916

Unnamed: 0,'CVGlmnetClassifier' NestedCV Performance
roc_auc,0.9894
F1,0.9224
F1_macro,0.9412
precision,0.9947
recall,0.8626
average_precision,0.9878
balanced_accuracy,0.9299
accuracy,0.9473
matthews_corrcoef,0.8896


In [18]:
NestedCV_cvglmnet_binomial_full.best_hp()

Unnamed: 0_level_0,Best Hyperparameters
Unnamed: 0_level_1,alpha
Outer Fold 1,0
Outer Fold 2,0
Outer Fold 3,0
Outer Fold 4,0
Outer Fold 5,1
Outer Fold 6,0
Outer Fold 7,1
Outer Fold 8,1
Outer Fold 9,0
Outer Fold 10,0


In [91]:
%%time

cvglmnet_gaussian_pipe_full = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', CVGlmnetClassifier(family='gaussian',
                                      random_state=5666))
    ])

param_grid_glmnet_full = {
     #We optimize 'lambda' on cv.glmnet, we only need to opt. alhpa
    'classifier__alpha': (0, 1) #Ridge Lasso
}

NestedCV_cvglmnet_gaussian_full = NestedCV(innercv=10, outercv=10)


NestedCV_cvglmnet_gaussian_full.fit(X, y,
           cvglmnet_gaussian_pipe_full,
           param_grid_glmnet_full,
           njobs=True,
           trace=False)

CPU times: user 17.6 s, sys: 858 ms, total: 18.4 s
Wall time: 25.2 s


In [92]:
NestedCV_cvglmnet_gaussian_full.performance() 

Unnamed: 0,'CVGlmnetClassifier' NestedCV Performance
roc_auc,0.991931
F1,0.930079
F1_macro,0.947062
precision,1.0
recall,0.871861
average_precision,0.990571
balanced_accuracy,0.935931
accuracy,0.952569
matthews_corrcoef,0.900797


In [94]:
NestedCV_cvglmnet_gaussian_full.best_hp()

Unnamed: 0_level_0,Best Hyperparameters
Unnamed: 0_level_1,alpha
Outer Fold 1,0
Outer Fold 2,0
Outer Fold 3,0
Outer Fold 4,0
Outer Fold 5,0
Outer Fold 6,0
Outer Fold 7,0
Outer Fold 8,0
Outer Fold 9,0
Outer Fold 10,0


# NestedCV for $\texttt{cv.glmnet(family=binomial)}$ with $\texttt{stepAICc()}$ inside CV

In [20]:
%%time

cvglmnet_binomial_pipe = Pipeline([
    ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', CVGlmnetClassifier(family='binomial',
                                      random_state=5666))
    ])


param_grid_glmnet = {
     #We optimize 'lambda' on cv.glmnet, we only need to opt. alhpa
    'classifier__alpha': (0, 0.3, 0.5, 0.7, 1),
    'StepAICc__direction': ('both', 'forward', 'backward')
}

NestedCV_cvglmnet_binomial = NestedCV(innercv=10, outercv=10)

NestedCV_cvglmnet_binomial.fit(X, y,
           cvglmnet_binomial_pipe,
           param_grid_glmnet)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9888 0.9952 0.9967 1.     0.9967 0.9753 1.     0.9984 1.     0.9753]
mean-inner-roc_auc : 0.993


Inner F1 scores: [0.9143 0.9474 0.9444 1.     0.9474 0.9444 1.     0.9444 0.973  0.9189]
mean-inner-F1 : 0.953


Inner F1_macro scores: [0.9354 0.9585 0.9571 1.     0.9581 0.9571 1.     0.9571 0.9788 0.9364]
mean-inner-F1_macro : 0.964


Inner precision scores: [1.     0.9474 1.     1.     0.9474 1.     1.     1.     1.     0.9444]
mean-inner-precision : 0.984


Inner recall scores: [0.8421 0.9474 0.8947 1.     0.9474 0.8947 1.     0.8947 0.9474 0.8947]
mean-inner-recall : 0.926


Inner average_precision scores: [0.9858 0.9928 0.995  1.     0.9946 0.9768 1.     0.9974 1.     0.9721]
mean-inner-average_precision : 0.991


Inner balanced_accuracy scores: [0.9211 0.9585 0.9474 1.     0.9581 0.9474 1.     0.94

In [90]:
NestedCV_cvglmnet_binomial.performance().round(3)

Unnamed: 0,'CVGlmnetClassifier' NestedCV Performance
roc_auc,0.99
F1,0.953
F1_macro,0.964
precision,0.977
recall,0.933
average_precision,0.99
balanced_accuracy,0.96
accuracy,0.967
matthews_corrcoef,0.929


In [22]:
NestedCV_cvglmnet_binomial.best_hp()

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,direction,alpha
Outer Fold 1,forward,1.0
Outer Fold 2,backward,0.3
Outer Fold 3,backward,0.5
Outer Fold 4,both,1.0
Outer Fold 5,backward,0.5
Outer Fold 6,both,0.3
Outer Fold 7,forward,0.3
Outer Fold 8,forward,0.5
Outer Fold 9,backward,0.3
Outer Fold 10,both,0.3


In [24]:
print(NestedCV_cvglmnet_binomial.best_hp().to_latex())

\begin{tabular}{llr}
\toprule
 & \multicolumn{2}{r}{Best Hyperparameters} \\
 & direction & alpha \\
\midrule
Outer Fold 1 & forward & 1.000000 \\
Outer Fold 2 & backward & 0.300000 \\
Outer Fold 3 & backward & 0.500000 \\
Outer Fold 4 & both & 1.000000 \\
Outer Fold 5 & backward & 0.500000 \\
Outer Fold 6 & both & 0.300000 \\
Outer Fold 7 & forward & 0.300000 \\
Outer Fold 8 & forward & 0.500000 \\
Outer Fold 9 & backward & 0.300000 \\
Outer Fold 10 & both & 0.300000 \\
\bottomrule
\end{tabular}



# NestedCV for $\texttt{cv.glmnet(family=gaussian)}$ with $\texttt{stepAICc()}$ inside CV

In [25]:
%%time

cvglmnet_gaussian_pipe = Pipeline([
    ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', CVGlmnetClassifier(family='gaussian',
                                      random_state=5666))
    ])


NestedCV_cvglmnet_gaussian = NestedCV(innercv=10, outercv=10)


NestedCV_cvglmnet_gaussian.fit(X, y,
                            cvglmnet_gaussian_pipe,
                            param_grid_glmnet)

Inner CV training & hyperparameter tuning on outer fold 1 ...

 _____________________________________________________________________ 

-> Outer fold 1 results:

Inner roc_auc scores: [0.9745 0.9984 0.9918 1.     0.9918 0.9836 1.     1.     1.     0.9589]
mean-inner-roc_auc : 0.990


Inner F1 scores: [0.8125 0.973  0.8824 0.973  0.9143 0.8485 0.9444 0.8824 1.     0.9143]
mean-inner-F1 : 0.914


Inner F1_macro scores: [0.8646 0.979  0.9118 0.9788 0.9348 0.888  0.9571 0.9118 1.     0.9348]
mean-inner-F1_macro : 0.936


Inner precision scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
mean-inner-precision : 1.000


Inner recall scores: [0.6842 0.9474 0.7895 0.9474 0.8421 0.7368 0.8947 0.7895 1.     0.8421]
mean-inner-recall : 0.847


Inner average_precision scores: [0.9634 0.9974 0.9881 1.     0.9868 0.9769 1.     1.     1.     0.9659]
mean-inner-average_precision : 0.988


Inner balanced_accuracy scores: [0.8421 0.9737 0.8947 0.9737 0.9211 0.8684 0.9474 0.8947 1.     0.9211]
mean-inner-balanced_ac

In [89]:
NestedCV_cvglmnet_gaussian.performance().round(3)

Unnamed: 0,'CVGlmnetClassifier' NestedCV Performance
roc_auc,0.987
F1,0.925
F1_macro,0.943
precision,0.994
recall,0.867
average_precision,0.987
balanced_accuracy,0.932
accuracy,0.949
matthews_corrcoef,0.893


In [102]:
pd.concat([NestedCV_cvglmnet_gaussian.best_hp(),
          NestedCV_cvglmnet_binomial.best_hp()],
          axis=1)

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,direction,alpha,direction.1,alpha.1
Outer Fold 1,backward,0.0,forward,1.0
Outer Fold 2,both,0.3,backward,0.3
Outer Fold 3,backward,0.0,backward,0.5
Outer Fold 4,both,0.3,both,1.0
Outer Fold 5,backward,0.0,backward,0.5
Outer Fold 6,backward,0.3,both,0.3
Outer Fold 7,forward,0.0,forward,0.3
Outer Fold 8,backward,0.0,forward,0.5
Outer Fold 9,both,0.7,backward,0.3
Outer Fold 10,backward,0.0,both,0.3


## NestedCV with $\texttt{hqreg()}$

In [19]:
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
from sklearn.base import BaseEstimator, ClassifierMixin

pandas2ri.activate()
hqreg = importr("hqreg")

class CVhqregClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=0.5, random_state=False):
        self.alpha = alpha
        self.random_state = random_state

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = np.array(X)
        if isinstance(y, pd.DataFrame):
            y = np.ravel(y)
            
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(y, np.ndarray):
            y = pd.Series(y)

        self.X_ = X
        self.y_ = y
        self.classes_ = np.unique(y)

        return self

    def predict_proba(self, X):
        np.random.seed(self.random_state)
        
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        ### R code for cv.hqreg() ###
        r_cv_hqreg = r('''
            cv_hqreg <- function (X, y, alpha=0.5,  newx, random_state) {
                
                set.seed(random_state)
                model <-  cv.hqreg(
                            X=as.matrix(X),
                            y=as.numeric(y),
                            nfolds=10, #10fold CV
                            alpha=alpha,
                            method='quantile', #LAD loss
                            seed=5666)

                return (as.numeric(predict(model,
                                           as.matrix(newx))))
                }
        ''')

        y_pred = np.array(
            r_cv_hqreg(
                X=self.X_,
                y=self.y_,
                alpha=self.alpha,
                newx=X,
                random_state=self.random_state))
        
        y_pred = np.array(list(map(lambda x: [abs(1-x), x], y_pred)))
        
        return y_pred
    
    def predict(self, X):
        binary_pred = np.array(list(map(lambda x: 1 if x>=0.5 else 0,
                                       self.predict_proba(X)[:, 1])))
        return binary_pred

    
    def score(self, X, y, metric='roc_auc'):
        # Create a dictionary of available metrics
        metrics = {
            'f1': f1_score,
            'roc_auc': roc_auc_score,
            'recall': recall_score,
            'precision': precision_score,
            'balanced_accuracy': balanced_accuracy_score
        }

        # Check if the specified metric is valid
        if metric not in metrics:
            raise ValueError(f"Invalid metric '{metric}', choose from {list(metrics.keys())}")

        # Calculate and return the score using the specified metric
        y_pred = self.predict(X)
        if metric=='roc_auc':
            y_pred = self.predict_proba(X)[:, 1]
        score_func = metrics[metric]
        
        return score_func(y, y_pred)


In [20]:
h = CVhqregClassifier()
h.fit(X,y)

In [131]:
h.predict_proba(X)

CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10


array([[ 0.21772352,  1.21772352],
       [ 0.28018701,  0.71981299],
       [ 0.01005871,  1.01005871],
       ...,
       [ 0.5418881 ,  0.4581119 ],
       [ 0.42197216,  1.42197216],
       [ 1.19899656, -0.19899656]])

In [None]:
%%time

## Full model LAD ##

LAD_pipe = Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', CVhqregClassifier(random_state=5666))
    ])

param_grid_LAD = {
    'classifier__alpha': (0, 0.1, 0.5, 0.7, 1),
    }

NestedCV_LAD = NestedCV(innercv=10, outercv=10)


NestedCV_LAD.fit(X, y,
                 LAD_pipe,
                 param_grid_LAD,
                 trace=False)

CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
CV fold #1
CV fold #2
CV fold #3
CV fold #4
CV fold #5
CV fold #6
CV fold #7
CV fold #8
CV fold #9
CV fold #10
C

In [147]:
NestedCV_LAD.performance()

Unnamed: 0,'CVhqregClassifier' NestedCV Performance
roc_auc,0.990741
F1,0.893621
F1_macro,0.920543
precision,1.0
recall,0.810823
average_precision,0.989109
balanced_accuracy,0.905411
accuracy,0.929762
matthews_corrcoef,0.854295


In [148]:
NestedCV_LAD.best_hp() #Ridge

Unnamed: 0_level_0,Best Hyperparameters
Unnamed: 0_level_1,alpha
Outer Fold 1,0.0
Outer Fold 2,0.0
Outer Fold 3,0.0
Outer Fold 4,0.0
Outer Fold 5,0.0
Outer Fold 6,0.1
Outer Fold 7,0.0
Outer Fold 8,0.0
Outer Fold 9,0.0
Outer Fold 10,0.0


In [159]:
NestedCV_LAD.total_models_fitted()*10

5010

# LAD NestedCV with $\texttt{StepAICc()}$

In [None]:
%%time

## LAD with StepAICc() inside the CV ##

LAD_stepAICc_pipe = Pipeline([
    ('StepAICc', StepAICc()),
    ('Scaler', StandardScaler()),
    ('classifier', CVhqregClassifier(random_state=5666))
    ])

param_grid_LAD_stepAICc = {
    'classifier__alpha': (0, 0.1, 0.5, 0.7, 1),
    'StepAICc__direction': ('both', 'forward', 'backward')
    }

NestedCV_LAD_stepAIcc = NestedCV(innercv=10, outercv=10)


NestedCV_LAD_stepAIcc.fit(X, y,
                          LAD_stepAICc_pipe,
                          param_grid_LAD_stepAICc)

In [161]:
pd.concat([NestedCV_LAD.performance(),
           NestedCV_LAD_stepAIcc.performance()],
           axis=1)

Unnamed: 0,'CVhqregClassifier' NestedCV Performance,'CVhqregClassifier' NestedCV Performance.1
roc_auc,0.990741,0.986111
F1,0.893621,0.891779
F1_macro,0.920543,0.91982
precision,1.0,1.0
recall,0.810823,0.810173
average_precision,0.989109,0.985919
balanced_accuracy,0.905411,0.905087
accuracy,0.929762,0.929762
matthews_corrcoef,0.854295,0.85426


In [166]:
pd.concat([NestedCV_LAD.best_hp(),
           NestedCV_LAD_stepAIcc.best_hp()],
           axis=1)

Unnamed: 0_level_0,Best Hyperparameters,Best Hyperparameters,Best Hyperparameters
Unnamed: 0_level_1,alpha,direction,alpha.1
Outer Fold 1,0.0,backward,0.0
Outer Fold 2,0.0,both,0.7
Outer Fold 3,0.0,backward,0.0
Outer Fold 4,0.0,both,0.1
Outer Fold 5,0.0,backward,0.1
Outer Fold 6,0.1,forward,1.0
Outer Fold 7,0.0,forward,0.1
Outer Fold 8,0.0,backward,0.0
Outer Fold 9,0.0,both,0.1
Outer Fold 10,0.0,backward,0.0


In [158]:
NestedCV_LAD_stepAIcc.total_models_fitted()*10

15010