In [4]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [6]:
# Load the breast-cancer dataset and separate the feature matrix from the labels.
data = load_breast_cancer()
X = data.data
y = data.target

# Candidate models, keyed by the name used when reporting results.
classifiers = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
}

# Hyperparameter grids, keyed by the same names as `classifiers`.
# The 'classifier__' prefix addresses the pipeline step named 'classifier'.
param_grids = {
    'RandomForest': {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [None, 10],
    },
    'AdaBoost': {
        'classifier__n_estimators': [50, 100],
        'classifier__learning_rate': [0.01, 0.1],
    },
}

# Outer evaluation loop: five shuffled, stratified folds with a fixed seed.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Filled in by the evaluation loop: model name -> mean outer-fold accuracy.
final_accuracy = {}


# Nested cross-validation: for every candidate model, tune hyperparameters on
# the inner folds of each outer training split, then score the tuned model on
# the held-out outer test split. The per-model result is the mean outer score.
for model_name, classifier in classifiers.items():
    # Neither the pipeline nor the inner splitter depends on the outer fold,
    # so they are built once per model instead of once per fold. The classifier
    # instance was already shared across folds; GridSearchCV is expected to fit
    # clones of the estimator it receives (see scikit-learn docs), so the shared
    # pipeline itself stays unfitted.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('selector', VarianceThreshold()),
        ('classifier', classifier),
    ])
    # An integer random_state makes every call to split() yield the same folds.
    inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

    scores = []  # One accuracy value per outer fold.

    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Hyperparameter search restricted to the outer training split, so the
        # outer test split never influences model selection.
        grid_search = GridSearchCV(
            pipe,
            param_grid=param_grids[model_name],
            cv=inner_cv,
            scoring='accuracy',
        )
        grid_search.fit(X_train, y_train)

        # Score the best pipeline found by the search on the untouched outer
        # test split.
        best_model = grid_search.best_estimator_
        test_score = best_model.score(X_test, y_test)
        scores.append(test_score)

    # Store a built-in float rather than np.float64: under NumPy >= 2.0 the
    # repr of a NumPy scalar is 'np.float64(...)', which would clutter the
    # printed dictionary below.
    final_accuracy[model_name] = float(np.mean(scores))

print(final_accuracy)






{'RandomForest': 0.9543393882937432, 'AdaBoost': 0.9542928116752056}
